489 files changed, 44853 insertions, 18771 deletions
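The first change below carries a local patch that wraps globals referenced from hand-written asm in a DECLARE_PROTECTED macro. The underlying issue: with default ELF visibility, a global exported from a shared object is preemptible, so position-independent references must go through the GOT, and GOT-offset relocations such as R_386_GOTOFF are rejected by newer binutils. "protected" visibility keeps the symbol exported but binds in-library references locally. A minimal standalone sketch of the attribute, not taken from the patch itself (names are illustrative; assumes GCC or Clang on an ELF target):

```c
/* sketch.c -- hypothetical example; build as PIC to see the effect:
 *   gcc -m32 -fPIC -shared sketch.c -o libsketch.so
 */

/* Default visibility: another DSO may preempt this definition, so PIC
 * code must reach it through the GOT. A GOT-offset (R_386_GOTOFF)
 * reference from asm would be rejected by binutils 2.27 and later. */
const short filter_default[4] = { 1, 2, 3, 4 };

/* Protected visibility: still exported, but references from inside this
 * shared object bind locally, so GOT-offset addressing becomes legal. */
__attribute__((visibility("protected")))
const short filter_protected[4] = { 1, 2, 3, 4 };

short first_tap(void) {
  /* In-DSO reference; with "protected" the toolchain may bind this
   * access directly instead of routing it through the GOT. */
  return filter_protected[0];
}
```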
diff --git a/Add-visibility-protected-attribute-for-global-variab.patch b/Add-visibility-protected-attribute-for-global-variab.patch
new file mode 100644
index 000000000..f15a73caf
--- /dev/null
+++ b/Add-visibility-protected-attribute-for-global-variab.patch
@@ -0,0 +1,77 @@
+From 0d88e15454b632d92404dd6a7181c58d9985e2a2 Mon Sep 17 00:00:00 2001
+From: Rahul Chaudhry <rahulchaudhry@google.com>
+Date: Tue, 9 May 2017 12:00:58 -0700
+Subject: [PATCH] Add visibility="protected" attribute for global variables
+ referenced in asm files.
+
+During aosp builds with binutils-2.27, we're seeing linker error
+messages of this form:
+libvpx.a(subpixel_mmx.o): relocation R_386_GOTOFF against preemptible
+symbol vp8_bilinear_filters_x86_8 cannot be used when making a shared
+object
+
+subpixel_mmx.o is assembled from "vp8/common/x86/subpixel_mmx.asm".
+Other messages refer to symbol references from deblock_sse2.o and
+subpixel_sse2.o, also assembled from asm files.
+
+This change marks such symbols as having "protected" visibility. This
+satisfies the linker as the symbols are not preemptible from outside
+the shared library now, which I think is the original intent anyway.
+
+Change-Id: I2817f7a5f43041533d65ebf41aefd63f8581a452
+---
+ vp8/common/x86/filter_x86.c | 3 ++-
+ vpx_dsp/deblock.c           | 4 ++--
+ vpx_ports/mem.h             | 6 ++++++
+ 3 files changed, 10 insertions(+), 3 deletions(-)
+
+diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c
+index 2405342f0..73435a7dd 100644
+--- a/vp8/common/x86/filter_x86.c
++++ b/vp8/common/x86/filter_x86.c
+@@ -17,7 +17,8 @@ DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) = {
+   { 32, 32, 32, 32, 96, 96, 96, 96 }, { 16, 16, 16, 16, 112, 112, 112, 112 }
+ };
+
+-DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) = {
++DECLARE_PROTECTED(DECLARE_ALIGNED(16, const short,
++                                  vp8_bilinear_filters_x86_8[8][16])) = {
+   { 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 },
+   { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+   { 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+diff --git a/vpx_dsp/deblock.c b/vpx_dsp/deblock.c
+index a0db1e40c..3734ac251 100644
+--- a/vpx_dsp/deblock.c
++++ b/vpx_dsp/deblock.c
+@@ -10,9 +10,9 @@
+ #include <assert.h>
+ #include <stdlib.h>
+ #include "./vpx_dsp_rtcd.h"
+-#include "vpx/vpx_integer.h"
++#include "vpx_ports/mem.h"
+
+-const int16_t vpx_rv[] = {
++DECLARE_PROTECTED(const int16_t vpx_rv[]) = {
+   8, 5, 2, 2, 8, 12, 4, 9, 8, 3, 0, 3, 9, 0, 0, 0, 8, 3, 14,
+   4, 10, 1, 11, 14, 1, 14, 9, 6, 12, 11, 8, 6, 10, 0, 0, 8, 9, 0,
+   3, 14, 8, 11, 13, 4, 2, 9, 0, 3, 9, 6, 1, 2, 3, 14, 13, 1, 8,
+diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
+index bfef783b1..35751cef8 100644
+--- a/vpx_ports/mem.h
++++ b/vpx_ports/mem.h
+@@ -23,6 +23,12 @@
+ #define DECLARE_ALIGNED(n, typ, val) typ val
+ #endif
+
++#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(_WIN32)
++#define DECLARE_PROTECTED(decl) decl __attribute__((visibility("protected")))
++#else
++#define DECLARE_PROTECTED(decl) decl
++#endif
++
+ #if HAVE_NEON && defined(_MSC_VER)
+ #define __builtin_prefetch(x)
+ #endif
+--
+2.15.1
+
diff --git a/Android.bp b/Android.bp
index ad834ebcb..432246f14 100644
--- a/Android.bp
+++ b/Android.bp
@@ -2,7 +2,6 @@
 // Generated from Android.bp.in, run ./generate_config.sh to regenerate
 
 libvpx_arm_neon_c_srcs = [
-    "libvpx/vp8/common/alloccommon.c",
     "libvpx/vp8/common/arm/loopfilter_arm.c",
     "libvpx/vp8/common/arm/neon/bilinearpredict_neon.c",
"libvpx/vp8/common/arm/neon/copymem_neon.c", @@ -19,142 +18,34 @@ libvpx_arm_neon_c_srcs = [ "libvpx/vp8/common/arm/neon/shortidct4x4llm_neon.c", "libvpx/vp8/common/arm/neon/sixtappredict_neon.c", "libvpx/vp8/common/arm/neon/vp8_loopfilter_neon.c", - "libvpx/vp8/common/blockd.c", - "libvpx/vp8/common/copy_c.c", - "libvpx/vp8/common/dequantize.c", - "libvpx/vp8/common/entropy.c", - "libvpx/vp8/common/entropymode.c", - "libvpx/vp8/common/entropymv.c", - "libvpx/vp8/common/extend.c", - "libvpx/vp8/common/filter.c", - "libvpx/vp8/common/findnearmv.c", - "libvpx/vp8/common/generic/systemdependent.c", - "libvpx/vp8/common/idct_blk.c", - "libvpx/vp8/common/idctllm.c", - "libvpx/vp8/common/loopfilter_filters.c", - "libvpx/vp8/common/mbpitch.c", - "libvpx/vp8/common/modecont.c", - "libvpx/vp8/common/quant_common.c", - "libvpx/vp8/common/reconinter.c", - "libvpx/vp8/common/reconintra.c", - "libvpx/vp8/common/reconintra4x4.c", - "libvpx/vp8/common/rtcd.c", - "libvpx/vp8/common/setupintrarecon.c", - "libvpx/vp8/common/swapyv12buffer.c", - "libvpx/vp8/common/treecoder.c", - "libvpx/vp8/common/vp8_loopfilter.c", - "libvpx/vp8/decoder/dboolhuff.c", - "libvpx/vp8/decoder/decodeframe.c", - "libvpx/vp8/decoder/decodemv.c", - "libvpx/vp8/decoder/detokenize.c", - "libvpx/vp8/decoder/onyxd_if.c", - "libvpx/vp8/decoder/threading.c", "libvpx/vp8/encoder/arm/neon/denoising_neon.c", "libvpx/vp8/encoder/arm/neon/fastquantizeb_neon.c", "libvpx/vp8/encoder/arm/neon/shortfdct_neon.c", "libvpx/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.c", - "libvpx/vp8/encoder/bitstream.c", - "libvpx/vp8/encoder/boolhuff.c", - "libvpx/vp8/encoder/dct.c", - "libvpx/vp8/encoder/denoising.c", - "libvpx/vp8/encoder/encodeframe.c", - "libvpx/vp8/encoder/encodeintra.c", - "libvpx/vp8/encoder/encodemb.c", - "libvpx/vp8/encoder/encodemv.c", - "libvpx/vp8/encoder/ethreading.c", - "libvpx/vp8/encoder/lookahead.c", - "libvpx/vp8/encoder/mcomp.c", - "libvpx/vp8/encoder/modecosts.c", - "libvpx/vp8/encoder/onyx_if.c", - "libvpx/vp8/encoder/pickinter.c", - "libvpx/vp8/encoder/picklpf.c", - "libvpx/vp8/encoder/ratectrl.c", - "libvpx/vp8/encoder/rdopt.c", - "libvpx/vp8/encoder/segmentation.c", - "libvpx/vp8/encoder/tokenize.c", - "libvpx/vp8/encoder/treewriter.c", - "libvpx/vp8/encoder/vp8_quantize.c", - "libvpx/vp8/vp8_cx_iface.c", - "libvpx/vp8/vp8_dx_iface.c", - "libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c", - "libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c", - "libvpx/vp9/common/vp9_alloccommon.c", - "libvpx/vp9/common/vp9_blockd.c", - "libvpx/vp9/common/vp9_common_data.c", - "libvpx/vp9/common/vp9_entropy.c", - "libvpx/vp9/common/vp9_entropymode.c", - "libvpx/vp9/common/vp9_entropymv.c", - "libvpx/vp9/common/vp9_filter.c", - "libvpx/vp9/common/vp9_frame_buffers.c", - "libvpx/vp9/common/vp9_idct.c", - "libvpx/vp9/common/vp9_loopfilter.c", - "libvpx/vp9/common/vp9_mvref_common.c", - "libvpx/vp9/common/vp9_pred_common.c", - "libvpx/vp9/common/vp9_quant_common.c", - "libvpx/vp9/common/vp9_reconinter.c", - "libvpx/vp9/common/vp9_reconintra.c", - "libvpx/vp9/common/vp9_rtcd.c", - "libvpx/vp9/common/vp9_scale.c", - "libvpx/vp9/common/vp9_scan.c", - "libvpx/vp9/common/vp9_seg_common.c", - "libvpx/vp9/common/vp9_thread_common.c", - "libvpx/vp9/common/vp9_tile_common.c", - "libvpx/vp9/decoder/vp9_decodeframe.c", - "libvpx/vp9/decoder/vp9_decodemv.c", - "libvpx/vp9/decoder/vp9_decoder.c", - "libvpx/vp9/decoder/vp9_detokenize.c", - "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c", 
- "libvpx/vp9/encoder/arm/neon/vp9_error_neon.c", + "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c", "libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c", - "libvpx/vp9/encoder/vp9_alt_ref_aq.c", - "libvpx/vp9/encoder/vp9_aq_360.c", - "libvpx/vp9/encoder/vp9_aq_complexity.c", - "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", - "libvpx/vp9/encoder/vp9_aq_variance.c", - "libvpx/vp9/encoder/vp9_bitstream.c", - "libvpx/vp9/encoder/vp9_context_tree.c", - "libvpx/vp9/encoder/vp9_cost.c", - "libvpx/vp9/encoder/vp9_dct.c", - "libvpx/vp9/encoder/vp9_encodeframe.c", - "libvpx/vp9/encoder/vp9_encodemb.c", - "libvpx/vp9/encoder/vp9_encodemv.c", - "libvpx/vp9/encoder/vp9_encoder.c", - "libvpx/vp9/encoder/vp9_ethread.c", - "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", - "libvpx/vp9/encoder/vp9_frame_scale.c", - "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", - "libvpx/vp9/encoder/vp9_mcomp.c", - "libvpx/vp9/encoder/vp9_multi_thread.c", - "libvpx/vp9/encoder/vp9_noise_estimate.c", - "libvpx/vp9/encoder/vp9_picklpf.c", - "libvpx/vp9/encoder/vp9_pickmode.c", - "libvpx/vp9/encoder/vp9_quantize.c", - "libvpx/vp9/encoder/vp9_ratectrl.c", - "libvpx/vp9/encoder/vp9_rd.c", - "libvpx/vp9/encoder/vp9_rdopt.c", - "libvpx/vp9/encoder/vp9_resize.c", - "libvpx/vp9/encoder/vp9_segmentation.c", - "libvpx/vp9/encoder/vp9_skin_detection.c", - "libvpx/vp9/encoder/vp9_speed_features.c", - "libvpx/vp9/encoder/vp9_subexp.c", - "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", - "libvpx/vp9/encoder/vp9_tokenize.c", - "libvpx/vp9/encoder/vp9_treewriter.c", - "libvpx/vp9/vp9_cx_iface.c", - "libvpx/vp9/vp9_dx_iface.c", - "libvpx/vpx/src/vpx_codec.c", - "libvpx/vpx/src/vpx_decoder.c", - "libvpx/vpx/src/vpx_encoder.c", - "libvpx/vpx/src/vpx_image.c", "libvpx/vpx_dsp/arm/avg_neon.c", + "libvpx/vpx_dsp/arm/avg_pred_neon.c", + "libvpx/vpx_dsp/arm/fdct16x16_neon.c", + "libvpx/vpx_dsp/arm/fdct32x32_neon.c", "libvpx/vpx_dsp/arm/fdct_neon.c", + "libvpx/vpx_dsp/arm/fdct_partial_neon.c", "libvpx/vpx_dsp/arm/fwd_txfm_neon.c", "libvpx/vpx_dsp/arm/hadamard_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_intrapred_neon.c", + "libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c", "libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c", "libvpx/vpx_dsp/arm/idct16x16_add_neon.c", "libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c", @@ -164,41 +55,21 @@ libvpx_arm_neon_c_srcs = [ "libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c", "libvpx/vpx_dsp/arm/idct8x8_add_neon.c", "libvpx/vpx_dsp/arm/intrapred_neon.c", + "libvpx/vpx_dsp/arm/quantize_neon.c", "libvpx/vpx_dsp/arm/sad4d_neon.c", "libvpx/vpx_dsp/arm/sad_neon.c", "libvpx/vpx_dsp/arm/subpel_variance_neon.c", "libvpx/vpx_dsp/arm/subtract_neon.c", "libvpx/vpx_dsp/arm/variance_neon.c", "libvpx/vpx_dsp/arm/vpx_convolve_neon.c", - "libvpx/vpx_dsp/avg.c", - "libvpx/vpx_dsp/bitreader.c", - "libvpx/vpx_dsp/bitreader_buffer.c", - "libvpx/vpx_dsp/bitwriter.c", - 
"libvpx/vpx_dsp/bitwriter_buffer.c", - "libvpx/vpx_dsp/fwd_txfm.c", - "libvpx/vpx_dsp/intrapred.c", - "libvpx/vpx_dsp/inv_txfm.c", - "libvpx/vpx_dsp/loopfilter.c", - "libvpx/vpx_dsp/prob.c", - "libvpx/vpx_dsp/psnr.c", - "libvpx/vpx_dsp/quantize.c", - "libvpx/vpx_dsp/sad.c", - "libvpx/vpx_dsp/subtract.c", - "libvpx/vpx_dsp/sum_squares.c", - "libvpx/vpx_dsp/variance.c", - "libvpx/vpx_dsp/vpx_convolve.c", - "libvpx/vpx_dsp/vpx_dsp_rtcd.c", - "libvpx/vpx_mem/vpx_mem.c", - "libvpx/vpx_ports/arm_cpudetect.c", - "libvpx/vpx_scale/generic/gen_scalers.c", - "libvpx/vpx_scale/generic/vpx_scale.c", - "libvpx/vpx_scale/generic/yv12config.c", - "libvpx/vpx_scale/generic/yv12extend.c", - "libvpx/vpx_scale/vpx_scale_rtcd.c", - "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c", "config/arm-neon/vpx_config.c", ] +libvpx_arm_neon_exclude_c_srcs = [ + "config/arm/vpx_config.c", +] + libvpx_arm_neon_asm_srcs = [ "libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm", "libvpx/vpx_dsp/arm/idct4x4_add_neon.asm", @@ -240,6 +111,7 @@ libvpx_arm_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -295,7 +167,6 @@ libvpx_arm_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -311,10 +182,8 @@ libvpx_arm_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -330,7 +199,6 @@ libvpx_arm_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -352,6 +220,7 @@ libvpx_arm_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -365,6 +234,7 @@ libvpx_arm_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/arm/vpx_config.c", ] @@ -410,6 +280,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -443,8 +314,6 @@ libvpx_arm64_c_srcs = [ "libvpx/vp8/encoder/vp8_quantize.c", "libvpx/vp8/vp8_cx_iface.c", "libvpx/vp8/vp8_dx_iface.c", - "libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c", - "libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c", "libvpx/vp9/common/vp9_alloccommon.c", "libvpx/vp9/common/vp9_blockd.c", "libvpx/vp9/common/vp9_common_data.c", @@ -471,9 +340,8 @@ libvpx_arm64_c_srcs = [ 
"libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c", - "libvpx/vp9/encoder/arm/neon/vp9_error_neon.c", + "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c", "libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", @@ -490,10 +358,8 @@ libvpx_arm64_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -509,7 +375,6 @@ libvpx_arm64_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -519,9 +384,26 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx/src/vpx_encoder.c", "libvpx/vpx/src/vpx_image.c", "libvpx/vpx_dsp/arm/avg_neon.c", + "libvpx/vpx_dsp/arm/avg_pred_neon.c", + "libvpx/vpx_dsp/arm/fdct16x16_neon.c", + "libvpx/vpx_dsp/arm/fdct32x32_neon.c", "libvpx/vpx_dsp/arm/fdct_neon.c", + "libvpx/vpx_dsp/arm/fdct_partial_neon.c", "libvpx/vpx_dsp/arm/fwd_txfm_neon.c", "libvpx/vpx_dsp/arm/hadamard_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_135_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_34_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c", + "libvpx/vpx_dsp/arm/highbd_intrapred_neon.c", + "libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c", + "libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c", "libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c", "libvpx/vpx_dsp/arm/idct16x16_add_neon.c", "libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c", @@ -534,6 +416,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_dsp/arm/idct8x8_add_neon.c", "libvpx/vpx_dsp/arm/intrapred_neon.c", "libvpx/vpx_dsp/arm/loopfilter_neon.c", + "libvpx/vpx_dsp/arm/quantize_neon.c", "libvpx/vpx_dsp/arm/sad4d_neon.c", "libvpx/vpx_dsp/arm/sad_neon.c", "libvpx/vpx_dsp/arm/subpel_variance_neon.c", @@ -543,6 +426,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c", "libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c", "libvpx/vpx_dsp/arm/vpx_convolve_neon.c", + "libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c", "libvpx/vpx_dsp/avg.c", "libvpx/vpx_dsp/bitreader.c", "libvpx/vpx_dsp/bitreader_buffer.c", @@ -556,6 +440,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -569,6 +454,7 @@ libvpx_arm64_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/arm64/vpx_config.c", ] @@ -598,6 +484,7 @@ 
libvpx_generic_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -653,7 +540,6 @@ libvpx_generic_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -669,10 +555,8 @@ libvpx_generic_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -688,7 +572,6 @@ libvpx_generic_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -710,6 +593,7 @@ libvpx_generic_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -722,152 +606,17 @@ libvpx_generic_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/generic/vpx_config.c", ] libvpx_mips32_dspr2_c_srcs = [ - "libvpx/vp8/common/alloccommon.c", - "libvpx/vp8/common/blockd.c", - "libvpx/vp8/common/copy_c.c", - "libvpx/vp8/common/dequantize.c", - "libvpx/vp8/common/entropy.c", - "libvpx/vp8/common/entropymode.c", - "libvpx/vp8/common/entropymv.c", - "libvpx/vp8/common/extend.c", - "libvpx/vp8/common/filter.c", - "libvpx/vp8/common/findnearmv.c", - "libvpx/vp8/common/generic/systemdependent.c", - "libvpx/vp8/common/idct_blk.c", - "libvpx/vp8/common/idctllm.c", - "libvpx/vp8/common/loopfilter_filters.c", - "libvpx/vp8/common/mbpitch.c", "libvpx/vp8/common/mips/dspr2/dequantize_dspr2.c", "libvpx/vp8/common/mips/dspr2/filter_dspr2.c", "libvpx/vp8/common/mips/dspr2/idct_blk_dspr2.c", "libvpx/vp8/common/mips/dspr2/idctllm_dspr2.c", "libvpx/vp8/common/mips/dspr2/reconinter_dspr2.c", "libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c", - "libvpx/vp8/common/modecont.c", - "libvpx/vp8/common/quant_common.c", - "libvpx/vp8/common/reconinter.c", - "libvpx/vp8/common/reconintra.c", - "libvpx/vp8/common/reconintra4x4.c", - "libvpx/vp8/common/rtcd.c", - "libvpx/vp8/common/setupintrarecon.c", - "libvpx/vp8/common/swapyv12buffer.c", - "libvpx/vp8/common/treecoder.c", - "libvpx/vp8/common/vp8_loopfilter.c", - "libvpx/vp8/decoder/dboolhuff.c", - "libvpx/vp8/decoder/decodeframe.c", - "libvpx/vp8/decoder/decodemv.c", - "libvpx/vp8/decoder/detokenize.c", - "libvpx/vp8/decoder/onyxd_if.c", - "libvpx/vp8/decoder/threading.c", - "libvpx/vp8/encoder/bitstream.c", - "libvpx/vp8/encoder/boolhuff.c", - "libvpx/vp8/encoder/dct.c", - "libvpx/vp8/encoder/denoising.c", - "libvpx/vp8/encoder/encodeframe.c", - "libvpx/vp8/encoder/encodeintra.c", - "libvpx/vp8/encoder/encodemb.c", - 
"libvpx/vp8/encoder/encodemv.c", - "libvpx/vp8/encoder/ethreading.c", - "libvpx/vp8/encoder/lookahead.c", - "libvpx/vp8/encoder/mcomp.c", - "libvpx/vp8/encoder/modecosts.c", - "libvpx/vp8/encoder/onyx_if.c", - "libvpx/vp8/encoder/pickinter.c", - "libvpx/vp8/encoder/picklpf.c", - "libvpx/vp8/encoder/ratectrl.c", - "libvpx/vp8/encoder/rdopt.c", - "libvpx/vp8/encoder/segmentation.c", - "libvpx/vp8/encoder/tokenize.c", - "libvpx/vp8/encoder/treewriter.c", - "libvpx/vp8/encoder/vp8_quantize.c", - "libvpx/vp8/vp8_cx_iface.c", - "libvpx/vp8/vp8_dx_iface.c", - "libvpx/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c", - "libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c", - "libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c", - "libvpx/vp9/common/vp9_alloccommon.c", - "libvpx/vp9/common/vp9_blockd.c", - "libvpx/vp9/common/vp9_common_data.c", - "libvpx/vp9/common/vp9_entropy.c", - "libvpx/vp9/common/vp9_entropymode.c", - "libvpx/vp9/common/vp9_entropymv.c", - "libvpx/vp9/common/vp9_filter.c", - "libvpx/vp9/common/vp9_frame_buffers.c", - "libvpx/vp9/common/vp9_idct.c", - "libvpx/vp9/common/vp9_loopfilter.c", - "libvpx/vp9/common/vp9_mvref_common.c", - "libvpx/vp9/common/vp9_pred_common.c", - "libvpx/vp9/common/vp9_quant_common.c", - "libvpx/vp9/common/vp9_reconinter.c", - "libvpx/vp9/common/vp9_reconintra.c", - "libvpx/vp9/common/vp9_rtcd.c", - "libvpx/vp9/common/vp9_scale.c", - "libvpx/vp9/common/vp9_scan.c", - "libvpx/vp9/common/vp9_seg_common.c", - "libvpx/vp9/common/vp9_thread_common.c", - "libvpx/vp9/common/vp9_tile_common.c", - "libvpx/vp9/decoder/vp9_decodeframe.c", - "libvpx/vp9/decoder/vp9_decodemv.c", - "libvpx/vp9/decoder/vp9_decoder.c", - "libvpx/vp9/decoder/vp9_detokenize.c", - "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", - "libvpx/vp9/encoder/vp9_alt_ref_aq.c", - "libvpx/vp9/encoder/vp9_aq_360.c", - "libvpx/vp9/encoder/vp9_aq_complexity.c", - "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", - "libvpx/vp9/encoder/vp9_aq_variance.c", - "libvpx/vp9/encoder/vp9_bitstream.c", - "libvpx/vp9/encoder/vp9_context_tree.c", - "libvpx/vp9/encoder/vp9_cost.c", - "libvpx/vp9/encoder/vp9_dct.c", - "libvpx/vp9/encoder/vp9_encodeframe.c", - "libvpx/vp9/encoder/vp9_encodemb.c", - "libvpx/vp9/encoder/vp9_encodemv.c", - "libvpx/vp9/encoder/vp9_encoder.c", - "libvpx/vp9/encoder/vp9_ethread.c", - "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", - "libvpx/vp9/encoder/vp9_frame_scale.c", - "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", - "libvpx/vp9/encoder/vp9_mcomp.c", - "libvpx/vp9/encoder/vp9_multi_thread.c", - "libvpx/vp9/encoder/vp9_noise_estimate.c", - "libvpx/vp9/encoder/vp9_picklpf.c", - "libvpx/vp9/encoder/vp9_pickmode.c", - "libvpx/vp9/encoder/vp9_quantize.c", - "libvpx/vp9/encoder/vp9_ratectrl.c", - "libvpx/vp9/encoder/vp9_rd.c", - "libvpx/vp9/encoder/vp9_rdopt.c", - "libvpx/vp9/encoder/vp9_resize.c", - "libvpx/vp9/encoder/vp9_segmentation.c", - "libvpx/vp9/encoder/vp9_skin_detection.c", - "libvpx/vp9/encoder/vp9_speed_features.c", - "libvpx/vp9/encoder/vp9_subexp.c", - "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", - "libvpx/vp9/encoder/vp9_tokenize.c", - "libvpx/vp9/encoder/vp9_treewriter.c", - "libvpx/vp9/vp9_cx_iface.c", - "libvpx/vp9/vp9_dx_iface.c", - "libvpx/vpx/src/vpx_codec.c", - "libvpx/vpx/src/vpx_decoder.c", - "libvpx/vpx/src/vpx_encoder.c", - "libvpx/vpx/src/vpx_image.c", - "libvpx/vpx_dsp/avg.c", - "libvpx/vpx_dsp/bitreader.c", - "libvpx/vpx_dsp/bitreader_buffer.c", - 
"libvpx/vpx_dsp/bitwriter.c", - "libvpx/vpx_dsp/bitwriter_buffer.c", - "libvpx/vpx_dsp/fwd_txfm.c", - "libvpx/vpx_dsp/intrapred.c", - "libvpx/vpx_dsp/inv_txfm.c", - "libvpx/vpx_dsp/loopfilter.c", "libvpx/vpx_dsp/mips/common_dspr2.c", "libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c", "libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c", @@ -882,192 +631,38 @@ libvpx_mips32_dspr2_c_srcs = [ "libvpx/vpx_dsp/mips/intrapred16_dspr2.c", "libvpx/vpx_dsp/mips/intrapred4_dspr2.c", "libvpx/vpx_dsp/mips/intrapred8_dspr2.c", - "libvpx/vpx_dsp/mips/itrans16_dspr2.c", - "libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c", - "libvpx/vpx_dsp/mips/itrans32_dspr2.c", - "libvpx/vpx_dsp/mips/itrans4_dspr2.c", - "libvpx/vpx_dsp/mips/itrans8_dspr2.c", "libvpx/vpx_dsp/mips/loopfilter_filters_dspr2.c", "libvpx/vpx_dsp/mips/loopfilter_mb_dspr2.c", "libvpx/vpx_dsp/mips/loopfilter_mb_horiz_dspr2.c", "libvpx/vpx_dsp/mips/loopfilter_mb_vert_dspr2.c", - "libvpx/vpx_dsp/prob.c", - "libvpx/vpx_dsp/psnr.c", - "libvpx/vpx_dsp/quantize.c", - "libvpx/vpx_dsp/sad.c", - "libvpx/vpx_dsp/subtract.c", - "libvpx/vpx_dsp/sum_squares.c", - "libvpx/vpx_dsp/variance.c", - "libvpx/vpx_dsp/vpx_convolve.c", - "libvpx/vpx_dsp/vpx_dsp_rtcd.c", - "libvpx/vpx_mem/vpx_mem.c", - "libvpx/vpx_scale/generic/gen_scalers.c", - "libvpx/vpx_scale/generic/vpx_scale.c", - "libvpx/vpx_scale/generic/yv12config.c", - "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/mips/dspr2/yv12extend_dspr2.c", - "libvpx/vpx_scale/vpx_scale_rtcd.c", - "libvpx/vpx_util/vpx_thread.c", "config/mips32-dspr2/vpx_config.c", ] +libvpx_mips32_dspr2_exclude_c_srcs = [ + "config/mips32/vpx_config.c", +] + libvpx_mips32_msa_c_srcs = [ - "libvpx/vp8/common/alloccommon.c", - "libvpx/vp8/common/blockd.c", - "libvpx/vp8/common/copy_c.c", - "libvpx/vp8/common/dequantize.c", - "libvpx/vp8/common/entropy.c", - "libvpx/vp8/common/entropymode.c", - "libvpx/vp8/common/entropymv.c", - "libvpx/vp8/common/extend.c", - "libvpx/vp8/common/filter.c", - "libvpx/vp8/common/findnearmv.c", - "libvpx/vp8/common/generic/systemdependent.c", - "libvpx/vp8/common/idct_blk.c", - "libvpx/vp8/common/idctllm.c", - "libvpx/vp8/common/loopfilter_filters.c", - "libvpx/vp8/common/mbpitch.c", "libvpx/vp8/common/mips/msa/bilinear_filter_msa.c", "libvpx/vp8/common/mips/msa/copymem_msa.c", "libvpx/vp8/common/mips/msa/idct_msa.c", "libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c", "libvpx/vp8/common/mips/msa/sixtap_filter_msa.c", - "libvpx/vp8/common/modecont.c", - "libvpx/vp8/common/quant_common.c", - "libvpx/vp8/common/reconinter.c", - "libvpx/vp8/common/reconintra.c", - "libvpx/vp8/common/reconintra4x4.c", - "libvpx/vp8/common/rtcd.c", - "libvpx/vp8/common/setupintrarecon.c", - "libvpx/vp8/common/swapyv12buffer.c", - "libvpx/vp8/common/treecoder.c", - "libvpx/vp8/common/vp8_loopfilter.c", - "libvpx/vp8/decoder/dboolhuff.c", - "libvpx/vp8/decoder/decodeframe.c", - "libvpx/vp8/decoder/decodemv.c", - "libvpx/vp8/decoder/detokenize.c", - "libvpx/vp8/decoder/onyxd_if.c", - "libvpx/vp8/decoder/threading.c", - "libvpx/vp8/encoder/bitstream.c", - "libvpx/vp8/encoder/boolhuff.c", - "libvpx/vp8/encoder/dct.c", - "libvpx/vp8/encoder/denoising.c", - "libvpx/vp8/encoder/encodeframe.c", - "libvpx/vp8/encoder/encodeintra.c", - "libvpx/vp8/encoder/encodemb.c", - "libvpx/vp8/encoder/encodemv.c", - "libvpx/vp8/encoder/ethreading.c", - "libvpx/vp8/encoder/lookahead.c", - "libvpx/vp8/encoder/mcomp.c", "libvpx/vp8/encoder/mips/msa/dct_msa.c", "libvpx/vp8/encoder/mips/msa/denoising_msa.c", "libvpx/vp8/encoder/mips/msa/encodeopt_msa.c", 
"libvpx/vp8/encoder/mips/msa/quantize_msa.c", - "libvpx/vp8/encoder/modecosts.c", - "libvpx/vp8/encoder/onyx_if.c", - "libvpx/vp8/encoder/pickinter.c", - "libvpx/vp8/encoder/picklpf.c", - "libvpx/vp8/encoder/ratectrl.c", - "libvpx/vp8/encoder/rdopt.c", - "libvpx/vp8/encoder/segmentation.c", - "libvpx/vp8/encoder/tokenize.c", - "libvpx/vp8/encoder/treewriter.c", - "libvpx/vp8/encoder/vp8_quantize.c", - "libvpx/vp8/vp8_cx_iface.c", - "libvpx/vp8/vp8_dx_iface.c", "libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c", "libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c", "libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c", - "libvpx/vp9/common/vp9_alloccommon.c", - "libvpx/vp9/common/vp9_blockd.c", - "libvpx/vp9/common/vp9_common_data.c", - "libvpx/vp9/common/vp9_entropy.c", - "libvpx/vp9/common/vp9_entropymode.c", - "libvpx/vp9/common/vp9_entropymv.c", - "libvpx/vp9/common/vp9_filter.c", - "libvpx/vp9/common/vp9_frame_buffers.c", - "libvpx/vp9/common/vp9_idct.c", - "libvpx/vp9/common/vp9_loopfilter.c", - "libvpx/vp9/common/vp9_mvref_common.c", - "libvpx/vp9/common/vp9_pred_common.c", - "libvpx/vp9/common/vp9_quant_common.c", - "libvpx/vp9/common/vp9_reconinter.c", - "libvpx/vp9/common/vp9_reconintra.c", - "libvpx/vp9/common/vp9_rtcd.c", - "libvpx/vp9/common/vp9_scale.c", - "libvpx/vp9/common/vp9_scan.c", - "libvpx/vp9/common/vp9_seg_common.c", - "libvpx/vp9/common/vp9_thread_common.c", - "libvpx/vp9/common/vp9_tile_common.c", - "libvpx/vp9/decoder/vp9_decodeframe.c", - "libvpx/vp9/decoder/vp9_decodemv.c", - "libvpx/vp9/decoder/vp9_decoder.c", - "libvpx/vp9/decoder/vp9_detokenize.c", - "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/mips/msa/vp9_error_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c", - "libvpx/vp9/encoder/vp9_alt_ref_aq.c", - "libvpx/vp9/encoder/vp9_aq_360.c", - "libvpx/vp9/encoder/vp9_aq_complexity.c", - "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", - "libvpx/vp9/encoder/vp9_aq_variance.c", - "libvpx/vp9/encoder/vp9_bitstream.c", - "libvpx/vp9/encoder/vp9_context_tree.c", - "libvpx/vp9/encoder/vp9_cost.c", - "libvpx/vp9/encoder/vp9_dct.c", - "libvpx/vp9/encoder/vp9_encodeframe.c", - "libvpx/vp9/encoder/vp9_encodemb.c", - "libvpx/vp9/encoder/vp9_encodemv.c", - "libvpx/vp9/encoder/vp9_encoder.c", - "libvpx/vp9/encoder/vp9_ethread.c", - "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", - "libvpx/vp9/encoder/vp9_frame_scale.c", - "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", - "libvpx/vp9/encoder/vp9_mcomp.c", - "libvpx/vp9/encoder/vp9_multi_thread.c", - "libvpx/vp9/encoder/vp9_noise_estimate.c", - "libvpx/vp9/encoder/vp9_picklpf.c", - "libvpx/vp9/encoder/vp9_pickmode.c", - "libvpx/vp9/encoder/vp9_quantize.c", - "libvpx/vp9/encoder/vp9_ratectrl.c", - "libvpx/vp9/encoder/vp9_rd.c", - "libvpx/vp9/encoder/vp9_rdopt.c", - "libvpx/vp9/encoder/vp9_resize.c", - "libvpx/vp9/encoder/vp9_segmentation.c", - "libvpx/vp9/encoder/vp9_skin_detection.c", - "libvpx/vp9/encoder/vp9_speed_features.c", - "libvpx/vp9/encoder/vp9_subexp.c", - "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", - "libvpx/vp9/encoder/vp9_tokenize.c", - "libvpx/vp9/encoder/vp9_treewriter.c", - "libvpx/vp9/vp9_cx_iface.c", - "libvpx/vp9/vp9_dx_iface.c", - "libvpx/vpx/src/vpx_codec.c", - "libvpx/vpx/src/vpx_decoder.c", - "libvpx/vpx/src/vpx_encoder.c", - "libvpx/vpx/src/vpx_image.c", - 
"libvpx/vpx_dsp/avg.c", - "libvpx/vpx_dsp/bitreader.c", - "libvpx/vpx_dsp/bitreader_buffer.c", - "libvpx/vpx_dsp/bitwriter.c", - "libvpx/vpx_dsp/bitwriter_buffer.c", - "libvpx/vpx_dsp/fwd_txfm.c", - "libvpx/vpx_dsp/intrapred.c", - "libvpx/vpx_dsp/inv_txfm.c", - "libvpx/vpx_dsp/loopfilter.c", "libvpx/vpx_dsp/mips/avg_msa.c", "libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c", "libvpx/vpx_dsp/mips/fwd_txfm_msa.c", - "libvpx/vpx_dsp/mips/idct16x16_msa.c", - "libvpx/vpx_dsp/mips/idct32x32_msa.c", - "libvpx/vpx_dsp/mips/idct4x4_msa.c", - "libvpx/vpx_dsp/mips/idct8x8_msa.c", "libvpx/vpx_dsp/mips/intrapred_msa.c", "libvpx/vpx_dsp/mips/loopfilter_16_msa.c", "libvpx/vpx_dsp/mips/loopfilter_4_msa.c", @@ -1085,25 +680,13 @@ libvpx_mips32_msa_c_srcs = [ "libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c", "libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c", "libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c", - "libvpx/vpx_dsp/prob.c", - "libvpx/vpx_dsp/psnr.c", - "libvpx/vpx_dsp/quantize.c", - "libvpx/vpx_dsp/sad.c", - "libvpx/vpx_dsp/subtract.c", - "libvpx/vpx_dsp/sum_squares.c", - "libvpx/vpx_dsp/variance.c", - "libvpx/vpx_dsp/vpx_convolve.c", - "libvpx/vpx_dsp/vpx_dsp_rtcd.c", - "libvpx/vpx_mem/vpx_mem.c", - "libvpx/vpx_scale/generic/gen_scalers.c", - "libvpx/vpx_scale/generic/vpx_scale.c", - "libvpx/vpx_scale/generic/yv12config.c", - "libvpx/vpx_scale/generic/yv12extend.c", - "libvpx/vpx_scale/vpx_scale_rtcd.c", - "libvpx/vpx_util/vpx_thread.c", "config/mips32-msa/vpx_config.c", ] +libvpx_mips32_msa_exclude_c_srcs = [ + "config/mips32/vpx_config.c", +] + libvpx_mips32_c_srcs = [ "libvpx/vp8/common/alloccommon.c", "libvpx/vp8/common/blockd.c", @@ -1130,6 +713,7 @@ libvpx_mips32_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -1185,7 +769,6 @@ libvpx_mips32_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -1201,10 +784,8 @@ libvpx_mips32_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -1220,7 +801,6 @@ libvpx_mips32_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -1242,6 +822,7 @@ libvpx_mips32_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -1254,166 +835,30 @@ libvpx_mips32_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/mips32/vpx_config.c", ] libvpx_mips64_msa_c_srcs = [ - "libvpx/vp8/common/alloccommon.c", - 
"libvpx/vp8/common/blockd.c", - "libvpx/vp8/common/copy_c.c", - "libvpx/vp8/common/dequantize.c", - "libvpx/vp8/common/entropy.c", - "libvpx/vp8/common/entropymode.c", - "libvpx/vp8/common/entropymv.c", - "libvpx/vp8/common/extend.c", - "libvpx/vp8/common/filter.c", - "libvpx/vp8/common/findnearmv.c", - "libvpx/vp8/common/generic/systemdependent.c", - "libvpx/vp8/common/idct_blk.c", - "libvpx/vp8/common/idctllm.c", - "libvpx/vp8/common/loopfilter_filters.c", - "libvpx/vp8/common/mbpitch.c", "libvpx/vp8/common/mips/msa/bilinear_filter_msa.c", "libvpx/vp8/common/mips/msa/copymem_msa.c", "libvpx/vp8/common/mips/msa/idct_msa.c", "libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c", "libvpx/vp8/common/mips/msa/sixtap_filter_msa.c", - "libvpx/vp8/common/modecont.c", - "libvpx/vp8/common/quant_common.c", - "libvpx/vp8/common/reconinter.c", - "libvpx/vp8/common/reconintra.c", - "libvpx/vp8/common/reconintra4x4.c", - "libvpx/vp8/common/rtcd.c", - "libvpx/vp8/common/setupintrarecon.c", - "libvpx/vp8/common/swapyv12buffer.c", - "libvpx/vp8/common/treecoder.c", - "libvpx/vp8/common/vp8_loopfilter.c", - "libvpx/vp8/decoder/dboolhuff.c", - "libvpx/vp8/decoder/decodeframe.c", - "libvpx/vp8/decoder/decodemv.c", - "libvpx/vp8/decoder/detokenize.c", - "libvpx/vp8/decoder/onyxd_if.c", - "libvpx/vp8/decoder/threading.c", - "libvpx/vp8/encoder/bitstream.c", - "libvpx/vp8/encoder/boolhuff.c", - "libvpx/vp8/encoder/dct.c", - "libvpx/vp8/encoder/denoising.c", - "libvpx/vp8/encoder/encodeframe.c", - "libvpx/vp8/encoder/encodeintra.c", - "libvpx/vp8/encoder/encodemb.c", - "libvpx/vp8/encoder/encodemv.c", - "libvpx/vp8/encoder/ethreading.c", - "libvpx/vp8/encoder/lookahead.c", - "libvpx/vp8/encoder/mcomp.c", "libvpx/vp8/encoder/mips/msa/dct_msa.c", "libvpx/vp8/encoder/mips/msa/denoising_msa.c", "libvpx/vp8/encoder/mips/msa/encodeopt_msa.c", "libvpx/vp8/encoder/mips/msa/quantize_msa.c", - "libvpx/vp8/encoder/modecosts.c", - "libvpx/vp8/encoder/onyx_if.c", - "libvpx/vp8/encoder/pickinter.c", - "libvpx/vp8/encoder/picklpf.c", - "libvpx/vp8/encoder/ratectrl.c", - "libvpx/vp8/encoder/rdopt.c", - "libvpx/vp8/encoder/segmentation.c", - "libvpx/vp8/encoder/tokenize.c", - "libvpx/vp8/encoder/treewriter.c", - "libvpx/vp8/encoder/vp8_quantize.c", - "libvpx/vp8/vp8_cx_iface.c", - "libvpx/vp8/vp8_dx_iface.c", "libvpx/vp9/common/mips/msa/vp9_idct16x16_msa.c", "libvpx/vp9/common/mips/msa/vp9_idct4x4_msa.c", "libvpx/vp9/common/mips/msa/vp9_idct8x8_msa.c", - "libvpx/vp9/common/vp9_alloccommon.c", - "libvpx/vp9/common/vp9_blockd.c", - "libvpx/vp9/common/vp9_common_data.c", - "libvpx/vp9/common/vp9_entropy.c", - "libvpx/vp9/common/vp9_entropymode.c", - "libvpx/vp9/common/vp9_entropymv.c", - "libvpx/vp9/common/vp9_filter.c", - "libvpx/vp9/common/vp9_frame_buffers.c", - "libvpx/vp9/common/vp9_idct.c", - "libvpx/vp9/common/vp9_loopfilter.c", - "libvpx/vp9/common/vp9_mvref_common.c", - "libvpx/vp9/common/vp9_pred_common.c", - "libvpx/vp9/common/vp9_quant_common.c", - "libvpx/vp9/common/vp9_reconinter.c", - "libvpx/vp9/common/vp9_reconintra.c", - "libvpx/vp9/common/vp9_rtcd.c", - "libvpx/vp9/common/vp9_scale.c", - "libvpx/vp9/common/vp9_scan.c", - "libvpx/vp9/common/vp9_seg_common.c", - "libvpx/vp9/common/vp9_thread_common.c", - "libvpx/vp9/common/vp9_tile_common.c", - "libvpx/vp9/decoder/vp9_decodeframe.c", - "libvpx/vp9/decoder/vp9_decodemv.c", - "libvpx/vp9/decoder/vp9_decoder.c", - "libvpx/vp9/decoder/vp9_detokenize.c", - "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", 
"libvpx/vp9/encoder/mips/msa/vp9_error_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct16x16_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct4x4_msa.c", "libvpx/vp9/encoder/mips/msa/vp9_fdct8x8_msa.c", - "libvpx/vp9/encoder/vp9_alt_ref_aq.c", - "libvpx/vp9/encoder/vp9_aq_360.c", - "libvpx/vp9/encoder/vp9_aq_complexity.c", - "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c", - "libvpx/vp9/encoder/vp9_aq_variance.c", - "libvpx/vp9/encoder/vp9_bitstream.c", - "libvpx/vp9/encoder/vp9_context_tree.c", - "libvpx/vp9/encoder/vp9_cost.c", - "libvpx/vp9/encoder/vp9_dct.c", - "libvpx/vp9/encoder/vp9_encodeframe.c", - "libvpx/vp9/encoder/vp9_encodemb.c", - "libvpx/vp9/encoder/vp9_encodemv.c", - "libvpx/vp9/encoder/vp9_encoder.c", - "libvpx/vp9/encoder/vp9_ethread.c", - "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", - "libvpx/vp9/encoder/vp9_frame_scale.c", - "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", - "libvpx/vp9/encoder/vp9_mcomp.c", - "libvpx/vp9/encoder/vp9_multi_thread.c", - "libvpx/vp9/encoder/vp9_noise_estimate.c", - "libvpx/vp9/encoder/vp9_picklpf.c", - "libvpx/vp9/encoder/vp9_pickmode.c", - "libvpx/vp9/encoder/vp9_quantize.c", - "libvpx/vp9/encoder/vp9_ratectrl.c", - "libvpx/vp9/encoder/vp9_rd.c", - "libvpx/vp9/encoder/vp9_rdopt.c", - "libvpx/vp9/encoder/vp9_resize.c", - "libvpx/vp9/encoder/vp9_segmentation.c", - "libvpx/vp9/encoder/vp9_skin_detection.c", - "libvpx/vp9/encoder/vp9_speed_features.c", - "libvpx/vp9/encoder/vp9_subexp.c", - "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", - "libvpx/vp9/encoder/vp9_tokenize.c", - "libvpx/vp9/encoder/vp9_treewriter.c", - "libvpx/vp9/vp9_cx_iface.c", - "libvpx/vp9/vp9_dx_iface.c", - "libvpx/vpx/src/vpx_codec.c", - "libvpx/vpx/src/vpx_decoder.c", - "libvpx/vpx/src/vpx_encoder.c", - "libvpx/vpx/src/vpx_image.c", - "libvpx/vpx_dsp/avg.c", - "libvpx/vpx_dsp/bitreader.c", - "libvpx/vpx_dsp/bitreader_buffer.c", - "libvpx/vpx_dsp/bitwriter.c", - "libvpx/vpx_dsp/bitwriter_buffer.c", - "libvpx/vpx_dsp/fwd_txfm.c", - "libvpx/vpx_dsp/intrapred.c", - "libvpx/vpx_dsp/inv_txfm.c", - "libvpx/vpx_dsp/loopfilter.c", "libvpx/vpx_dsp/mips/avg_msa.c", "libvpx/vpx_dsp/mips/fwd_dct32x32_msa.c", "libvpx/vpx_dsp/mips/fwd_txfm_msa.c", - "libvpx/vpx_dsp/mips/idct16x16_msa.c", - "libvpx/vpx_dsp/mips/idct32x32_msa.c", - "libvpx/vpx_dsp/mips/idct4x4_msa.c", - "libvpx/vpx_dsp/mips/idct8x8_msa.c", "libvpx/vpx_dsp/mips/intrapred_msa.c", "libvpx/vpx_dsp/mips/loopfilter_16_msa.c", "libvpx/vpx_dsp/mips/loopfilter_4_msa.c", @@ -1431,25 +876,13 @@ libvpx_mips64_msa_c_srcs = [ "libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c", "libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c", "libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c", - "libvpx/vpx_dsp/prob.c", - "libvpx/vpx_dsp/psnr.c", - "libvpx/vpx_dsp/quantize.c", - "libvpx/vpx_dsp/sad.c", - "libvpx/vpx_dsp/subtract.c", - "libvpx/vpx_dsp/sum_squares.c", - "libvpx/vpx_dsp/variance.c", - "libvpx/vpx_dsp/vpx_convolve.c", - "libvpx/vpx_dsp/vpx_dsp_rtcd.c", - "libvpx/vpx_mem/vpx_mem.c", - "libvpx/vpx_scale/generic/gen_scalers.c", - "libvpx/vpx_scale/generic/vpx_scale.c", - "libvpx/vpx_scale/generic/yv12config.c", - "libvpx/vpx_scale/generic/yv12extend.c", - "libvpx/vpx_scale/vpx_scale_rtcd.c", - "libvpx/vpx_util/vpx_thread.c", "config/mips64-msa/vpx_config.c", ] +libvpx_mips64_msa_exclude_c_srcs = [ + "config/mips64/vpx_config.c", +] + libvpx_mips64_c_srcs = [ "libvpx/vp8/common/alloccommon.c", "libvpx/vp8/common/blockd.c", @@ -1476,6 +909,7 @@ libvpx_mips64_c_srcs 
= [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/decoder/dboolhuff.c", "libvpx/vp8/decoder/decodeframe.c", "libvpx/vp8/decoder/decodemv.c", @@ -1531,7 +965,6 @@ libvpx_mips64_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -1547,10 +980,8 @@ libvpx_mips64_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -1566,7 +997,6 @@ libvpx_mips64_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/vp9_cx_iface.c", @@ -1588,6 +1018,7 @@ libvpx_mips64_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -1600,6 +1031,7 @@ libvpx_mips64_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/mips64/vpx_config.c", ] @@ -1631,6 +1063,7 @@ libvpx_x86_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/common/x86/filter_x86.c", "libvpx/vp8/common/x86/idct_blk_mmx.c", "libvpx/vp8/common/x86/idct_blk_sse2.c", @@ -1664,10 +1097,9 @@ libvpx_x86_c_srcs = [ "libvpx/vp8/encoder/treewriter.c", "libvpx/vp8/encoder/vp8_quantize.c", "libvpx/vp8/encoder/x86/denoising_sse2.c", - "libvpx/vp8/encoder/x86/quantize_ssse3.c", - "libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c", "libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c", "libvpx/vp8/encoder/x86/vp8_quantize_sse2.c", + "libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c", "libvpx/vp8/vp8_cx_iface.c", "libvpx/vp8/vp8_dx_iface.c", "libvpx/vp9/common/vp9_alloccommon.c", @@ -1697,7 +1129,6 @@ libvpx_x86_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -1713,10 +1144,8 @@ libvpx_x86_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -1732,12 +1161,12 @@ libvpx_x86_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - 
"libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_dct_ssse3.c", "libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c", + "libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_quantize_sse2.c", "libvpx/vp9/vp9_cx_iface.c", "libvpx/vp9/vp9_dx_iface.c", @@ -1760,6 +1189,7 @@ libvpx_x86_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -1768,10 +1198,20 @@ libvpx_x86_c_srcs = [ "libvpx/vpx_dsp/x86/avg_intrin_sse2.c", "libvpx/vpx_dsp/x86/avg_pred_sse2.c", "libvpx/vpx_dsp/x86/fwd_txfm_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c", + "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c", + "libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c", + "libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c", + "libvpx/vpx_dsp/x86/highbd_variance_sse2.c", "libvpx/vpx_dsp/x86/inv_txfm_sse2.c", "libvpx/vpx_dsp/x86/inv_txfm_ssse3.c", "libvpx/vpx_dsp/x86/loopfilter_sse2.c", "libvpx/vpx_dsp/x86/quantize_sse2.c", + "libvpx/vpx_dsp/x86/quantize_ssse3.c", "libvpx/vpx_dsp/x86/sum_squares_sse2.c", "libvpx/vpx_dsp/x86/variance_sse2.c", "libvpx/vpx_dsp/x86/vpx_asm_stubs.c", @@ -1783,6 +1223,7 @@ libvpx_x86_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/x86/vpx_config.c", ] @@ -1803,11 +1244,15 @@ libvpx_x86_asm_srcs = [ "libvpx/vp8/encoder/x86/dct_sse2.asm", "libvpx/vp8/encoder/x86/encodeopt.asm", "libvpx/vp8/encoder/x86/fwalsh_sse2.asm", - "libvpx/vp8/encoder/x86/quantize_mmx.asm", "libvpx/vp9/encoder/x86/vp9_dct_sse2.asm", "libvpx/vp9/encoder/x86/vp9_error_sse2.asm", "libvpx/vpx_dsp/x86/add_noise_sse2.asm", "libvpx/vpx_dsp/x86/deblock_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_sad_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm", "libvpx/vpx_dsp/x86/intrapred_sse2.asm", "libvpx/vpx_dsp/x86/intrapred_ssse3.asm", "libvpx/vpx_dsp/x86/inv_wht_sse2.asm", @@ -1818,6 +1263,8 @@ libvpx_x86_asm_srcs = [ "libvpx/vpx_dsp/x86/subpel_variance_sse2.asm", "libvpx/vpx_dsp/x86/subtract_sse2.asm", "libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm", + "libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm", + "libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm", @@ -1854,6 +1301,7 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp8/common/swapyv12buffer.c", "libvpx/vp8/common/treecoder.c", "libvpx/vp8/common/vp8_loopfilter.c", + "libvpx/vp8/common/vp8_skin_detection.c", "libvpx/vp8/common/x86/filter_x86.c", "libvpx/vp8/common/x86/idct_blk_mmx.c", "libvpx/vp8/common/x86/idct_blk_sse2.c", @@ -1887,10 +1335,9 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp8/encoder/treewriter.c", "libvpx/vp8/encoder/vp8_quantize.c", "libvpx/vp8/encoder/x86/denoising_sse2.c", - 
"libvpx/vp8/encoder/x86/quantize_ssse3.c", - "libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c", "libvpx/vp8/encoder/x86/vp8_enc_stubs_sse2.c", "libvpx/vp8/encoder/x86/vp8_quantize_sse2.c", + "libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c", "libvpx/vp8/vp8_cx_iface.c", "libvpx/vp8/vp8_dx_iface.c", "libvpx/vp9/common/vp9_alloccommon.c", @@ -1920,7 +1367,6 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp9/decoder/vp9_decoder.c", "libvpx/vp9/decoder/vp9_detokenize.c", "libvpx/vp9/decoder/vp9_dsubexp.c", - "libvpx/vp9/decoder/vp9_dthread.c", "libvpx/vp9/encoder/vp9_alt_ref_aq.c", "libvpx/vp9/encoder/vp9_aq_360.c", "libvpx/vp9/encoder/vp9_aq_complexity.c", @@ -1936,10 +1382,8 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp9/encoder/vp9_encoder.c", "libvpx/vp9/encoder/vp9_ethread.c", "libvpx/vp9/encoder/vp9_extend.c", - "libvpx/vp9/encoder/vp9_firstpass.c", "libvpx/vp9/encoder/vp9_frame_scale.c", "libvpx/vp9/encoder/vp9_lookahead.c", - "libvpx/vp9/encoder/vp9_mbgraph.c", "libvpx/vp9/encoder/vp9_mcomp.c", "libvpx/vp9/encoder/vp9_multi_thread.c", "libvpx/vp9/encoder/vp9_noise_estimate.c", @@ -1955,12 +1399,12 @@ libvpx_x86_64_c_srcs = [ "libvpx/vp9/encoder/vp9_speed_features.c", "libvpx/vp9/encoder/vp9_subexp.c", "libvpx/vp9/encoder/vp9_svc_layercontext.c", - "libvpx/vp9/encoder/vp9_temporal_filter.c", "libvpx/vp9/encoder/vp9_tokenize.c", "libvpx/vp9/encoder/vp9_treewriter.c", "libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_dct_ssse3.c", "libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c", + "libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c", "libvpx/vp9/encoder/x86/vp9_quantize_sse2.c", "libvpx/vp9/vp9_cx_iface.c", "libvpx/vp9/vp9_dx_iface.c", @@ -1983,6 +1427,7 @@ libvpx_x86_64_c_srcs = [ "libvpx/vpx_dsp/psnr.c", "libvpx/vpx_dsp/quantize.c", "libvpx/vpx_dsp/sad.c", + "libvpx/vpx_dsp/skin_detection.c", "libvpx/vpx_dsp/subtract.c", "libvpx/vpx_dsp/sum_squares.c", "libvpx/vpx_dsp/variance.c", @@ -1991,10 +1436,20 @@ libvpx_x86_64_c_srcs = [ "libvpx/vpx_dsp/x86/avg_intrin_sse2.c", "libvpx/vpx_dsp/x86/avg_pred_sse2.c", "libvpx/vpx_dsp/x86/fwd_txfm_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c", + "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c", + "libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c", + "libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c", + "libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c", + "libvpx/vpx_dsp/x86/highbd_variance_sse2.c", "libvpx/vpx_dsp/x86/inv_txfm_sse2.c", "libvpx/vpx_dsp/x86/inv_txfm_ssse3.c", "libvpx/vpx_dsp/x86/loopfilter_sse2.c", "libvpx/vpx_dsp/x86/quantize_sse2.c", + "libvpx/vpx_dsp/x86/quantize_ssse3.c", "libvpx/vpx_dsp/x86/sum_squares_sse2.c", "libvpx/vpx_dsp/x86/variance_sse2.c", "libvpx/vpx_dsp/x86/vpx_asm_stubs.c", @@ -2006,6 +1461,7 @@ libvpx_x86_64_c_srcs = [ "libvpx/vpx_scale/generic/yv12extend.c", "libvpx/vpx_scale/vpx_scale_rtcd.c", "libvpx/vpx_util/vpx_thread.c", + "libvpx/vpx_util/vpx_write_yuv_frame.c", "config/x86_64/vpx_config.c", ] @@ -2027,7 +1483,6 @@ libvpx_x86_64_asm_srcs = [ "libvpx/vp8/encoder/x86/dct_sse2.asm", "libvpx/vp8/encoder/x86/encodeopt.asm", "libvpx/vp8/encoder/x86/fwalsh_sse2.asm", - "libvpx/vp8/encoder/x86/quantize_mmx.asm", "libvpx/vp9/encoder/x86/vp9_dct_sse2.asm", "libvpx/vp9/encoder/x86/vp9_error_sse2.asm", "libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm", @@ -2035,10 +1490,14 @@ libvpx_x86_64_asm_srcs = [ "libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm", 
"libvpx/vpx_dsp/x86/deblock_sse2.asm", "libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm", + "libvpx/vpx_dsp/x86/highbd_intrapred_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_sad4d_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_sad_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm", + "libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm", "libvpx/vpx_dsp/x86/intrapred_sse2.asm", "libvpx/vpx_dsp/x86/intrapred_ssse3.asm", "libvpx/vpx_dsp/x86/inv_wht_sse2.asm", - "libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm", "libvpx/vpx_dsp/x86/sad4d_sse2.asm", "libvpx/vpx_dsp/x86/sad_sse2.asm", "libvpx/vpx_dsp/x86/sad_sse3.asm", @@ -2047,6 +1506,8 @@ libvpx_x86_64_asm_srcs = [ "libvpx/vpx_dsp/x86/subpel_variance_sse2.asm", "libvpx/vpx_dsp/x86/subtract_sse2.asm", "libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm", + "libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm", + "libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm", "libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm", @@ -2086,7 +1547,7 @@ cc_library_static { local_include_dirs: ["config/arm"], neon: { - exclude_srcs: libvpx_arm_c_srcs, + exclude_srcs: libvpx_arm_neon_exclude_c_srcs, srcs: libvpx_arm_neon_c_srcs, generated_sources: ["libvpx_arm_neon_asm_srcs_converted"], local_include_dirs: ["config/arm-neon"], @@ -2103,13 +1564,13 @@ cc_library_static { local_include_dirs: ["config/mips32"], dspr2: { - exclude_srcs: libvpx_mips32_c_srcs, + exclude_srcs: libvpx_mips32_dspr2_exclude_c_srcs, srcs: libvpx_mips32_dspr2_c_srcs, local_include_dirs: ["config/mips32-dspr2"], }, msa: { - exclude_srcs: libvpx_mips32_c_srcs, + exclude_srcs: libvpx_mips32_msa_exclude_c_srcs, srcs: libvpx_mips32_msa_c_srcs, local_include_dirs: ["config/mips32-msa"], }, @@ -2120,7 +1581,7 @@ cc_library_static { local_include_dirs: ["config/mips64"], msa: { - exclude_srcs: libvpx_mips64_c_srcs, + exclude_srcs: libvpx_mips64_msa_exclude_c_srcs, srcs: libvpx_mips64_msa_c_srcs, local_include_dirs: ["config/mips64-msa"], }, diff --git a/Android.bp.in b/Android.bp.in index ac6a46435..0fb7c9581 100644 --- a/Android.bp.in +++ b/Android.bp.in @@ -15,6 +15,7 @@ gensrcs { cc_library_static { name: "libvpx", + vendor_available: true, arch: { arm: { @@ -28,7 +29,7 @@ cc_library_static { local_include_dirs: ["config/arm"], neon: { - exclude_srcs: libvpx_arm_c_srcs, + exclude_srcs: libvpx_arm_neon_exclude_c_srcs, srcs: libvpx_arm_neon_c_srcs, generated_sources: ["libvpx_arm_neon_asm_srcs_converted"], local_include_dirs: ["config/arm-neon"], @@ -45,13 +46,13 @@ cc_library_static { local_include_dirs: ["config/mips32"], dspr2: { - exclude_srcs: libvpx_mips32_c_srcs, + exclude_srcs: libvpx_mips32_dspr2_exclude_c_srcs, srcs: libvpx_mips32_dspr2_c_srcs, local_include_dirs: ["config/mips32-dspr2"], }, msa: { - exclude_srcs: libvpx_mips32_c_srcs, + exclude_srcs: libvpx_mips32_msa_exclude_c_srcs, srcs: libvpx_mips32_msa_c_srcs, local_include_dirs: ["config/mips32-msa"], }, @@ -62,7 +63,7 @@ cc_library_static { local_include_dirs: ["config/mips64"], msa: { - exclude_srcs: libvpx_mips64_c_srcs, + exclude_srcs: libvpx_mips64_msa_exclude_c_srcs, srcs: libvpx_mips64_msa_c_srcs, local_include_dirs: ["config/mips64-msa"], }, diff --git a/CleanSpec.mk b/CleanSpec.mk new file mode 100644 index 000000000..cac3d3bc5 --- /dev/null +++ b/CleanSpec.mk @@ -0,0 +1,53 @@ +# Copyright (C) 2017 The Android Open Source Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# If you don't need to do a full clean build but would like to touch
+# a file or delete some intermediate files, add a clean step to the end
+# of the list. These steps will only be run once, if they haven't been
+# run before.
+#
+# E.g.:
+#     $(call add-clean-step, touch -c external/sqlite/sqlite3.h)
+#     $(call add-clean-step, rm -rf $(PRODUCT_OUT)/obj/STATIC_LIBRARIES/libz_intermediates)
+#
+# Always use "touch -c" and "rm -f" or "rm -rf" to gracefully deal with
+# files that are missing or have been moved.
+#
+# Use $(PRODUCT_OUT) to get to the "out/target/product/blah/" directory.
+# Use $(OUT_DIR) to refer to the "out" directory.
+#
+# If you need to re-do something that's already mentioned, just copy
+# the command and add it to the bottom of the list. E.g., if a change
+# that you made last week required touching a file and a change you
+# made today requires touching the same file, just copy the old
+# touch step and add it to the end of the list.
+#
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# For example:
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/APPS/AndroidTests_intermediates)
+#$(call add-clean-step, rm -rf $(OUT_DIR)/target/common/obj/JAVA_LIBRARIES/core_intermediates)
+#$(call add-clean-step, find $(OUT_DIR) -type f -name "IGTalkSession*" -print0 | xargs -0 rm -f)
+#$(call add-clean-step, rm -rf $(PRODUCT_OUT)/data/*)
+
+# ************************************************
+# NEWER CLEAN STEPS MUST BE AT THE END OF THE LIST
+# ************************************************
+
+# vpx_config.asm change
+$(call add-clean-step, rm -rf $(OUT_DIR)/soong/.intermediates/external/libvpx/libvpx)
+
diff --git a/README.android b/README.android
index 92a84980b..92739cd7b 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
 Name: libvpx
 URL: http://www.webmproject.org
-Version: v1.6.1-665-gbcfd9c975
+Version: v1.7.0
 License: BSD
 License File: libvpx/LICENSE
-Date: Tuesday May 23 2017
-Branch: origin/master
-Commit: bcfd9c97508531a81cc2f5d393edb9eb1b00ce79
+Date: Wednesday January 24 2018
+Branch: origin/mandarinduck
+Commit: f80be22a1099b2a431c2796f529bb261064ec6b4
 
 Description:
 Contains the sources used to compile libvpx.
diff --git a/README.version b/README.version
index 07913c812..c6c6a3724 100644
--- a/README.version
+++ b/README.version
@@ -1,4 +1,6 @@
-URL: https://chromium.googlesource.com/webm/libvpx.git/+archive/bcfd9c97508531a81cc2f5d393edb9eb1b00ce79.tar.gz
-Version: v1.6.1-665-gbcfd9c975
+URL: https://chromium.googlesource.com/webm/libvpx.git/+archive/v1.7.0.tar.gz
+Version: v1.7.0
 BugComponent: 42195
 Owners: johannkoenig
+Local Modifications:
+  Add visibility="protected" attribute for global variables referenced in asm files.
diff --git a/config/arm-neon/vp8_rtcd.h b/config/arm-neon/vp8_rtcd.h
index 3f112f6f7..4eb59c663 100644
--- a/config/arm-neon/vp8_rtcd.h
+++ b/config/arm-neon/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
index 1df16205a..0f4f04d1f 100644
--- a/config/arm-neon/vp9_rtcd.h
+++ b/config/arm-neon/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,9 +34,8 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_neon
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
#define vp9_diamond_search_sad vp9_diamond_search_sad_c
@@ -53,35 +53,62 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
#define vp9_quantize_fp vp9_quantize_fp_neon
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
-#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon
void vp9_rtcd(void);
diff --git a/config/arm-neon/vpx_config.asm b/config/arm-neon/vpx_config.asm
index fdeb46a67..0a0b1d240 100644
--- a/config/arm-neon/vpx_config.asm
+++ b/config/arm-neon/vpx_config.asm
@@ -20,7 +20,9 @@
.equ HAVE_SSE4_1 , 0
.equ HAVE_AVX , 0
.equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
.equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
.equ HAVE_VPX_PORTS , 1
.equ HAVE_PTHREAD_H , 1
.equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
.equ CONFIG_TEMPORAL_DENOISING , 1
.equ CONFIG_VP9_TEMPORAL_DENOISING , 0
.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
.equ CONFIG_EXPERIMENTAL , 0
.equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
.equ CONFIG_SPATIAL_SVC , 0
.equ CONFIG_FP_MB_STATS , 0
.equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/arm-neon/vpx_config.c b/config/arm-neon/vpx_config.c
index 0eb0a305c..95e12998c 100644
--- a/config/arm-neon/vpx_config.c
+++ b/config/arm-neon/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS.  All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/arm-neon/vpx_config.h b/config/arm-neon/vpx_config.h
index d632a2191..e9d645653 100644
--- a/config/arm-neon/vpx_config.h
+++ b/config/arm-neon/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h
index a915afabf..d911fd37f 100644
--- a/config/arm-neon/vpx_dsp_rtcd.h
+++ b/config/arm-neon/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -28,38 +30,39 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_neon void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_neon -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_neon -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_neon -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, 
int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_neon -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_neon -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_neon -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_neon void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -213,26 +216,32 @@ void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_c +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_neon void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16_1 vpx_fdct16x16_1_c +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32 vpx_fdct32x32_c +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_neon void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_1 vpx_fdct32x32_1_c +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct4x4 vpx_fdct4x4_neon void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct4x4_1 vpx_fdct4x4_1_c +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); @@ -273,17 +282,915 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_neon -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_8x8 
vpx_hadamard_8x8_neon void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t 
vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); 
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 
vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + 
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, 
const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + 
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_neon + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_neon + +void 
vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_neon
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_neon
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_neon
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_neon
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_neon
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_neon
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_neon
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_neon
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_neon
+
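Each entry in these generated RTCD headers follows the same pattern: a portable _c reference implementation is declared, a NEON kernel is declared when this configuration has one, and a #define binds the generic name to whichever implementation was selected at configure time. A minimal sketch of what a call site sees (the wrapper function below is hypothetical, not part of the library, and assumes the generated vpx_dsp_rtcd.h is on the include path):

#include <stddef.h>
#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

/* On this arm-neon config the generic name expands to
   vpx_highbd_d135_predictor_8x8_neon via the #define above; on a
   config without NEON it would expand to the _c fallback instead. */
static void predict_d135_8x8(uint16_t *dst, ptrdiff_t stride,
                             const uint16_t *above, const uint16_t *left,
                             int bd) {
  vpx_highbd_d135_predictor_8x8(dst, stride, above, left, bd);
}
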
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_neon
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_neon
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_neon
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_neon
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_neon
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_neon
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_neon
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_neon
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_neon
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_neon
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_neon
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_neon
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_neon
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_neon
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_neon
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_neon
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_neon
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon
+
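The dc/dc_left/dc_top/dc_128 family above differs only in which borders feed the average. A rough sketch of the plain DC case (my own illustration, not the library source): dc_left and dc_top average a single border, and dc_128 skips the borders entirely, writing the bit-depth midpoint 1 << (bd - 1).

#include <stddef.h>
#include <stdint.h>

/* Illustrative only: average the `above` and `left` borders and fill
   the size x size block with the rounded mean. */
static void highbd_dc_sketch(uint16_t *dst, ptrdiff_t stride,
                             const uint16_t *above, const uint16_t *left,
                             int size) {
  int i, r, c, sum = 0;
  for (i = 0; i < size; ++i) sum += above[i] + left[i];
  {
    const uint16_t dc = (uint16_t)((sum + size) / (2 * size));
    for (r = 0; r < size; ++r, dst += stride)
      for (c = 0; c < size; ++c) dst[c] = dc;
  }
}
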
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_neon
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_neon
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_neon
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_neon
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_neon
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_neon
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_neon
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_neon
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_neon
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_neon
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_neon
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_neon
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_neon
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_neon
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_neon
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_neon
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_neon
+
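The numeric suffixes on the idct entries above (_1_add, _34_add, _38_add, _135_add, ...) encode how many nonzero coefficients the kernel assumes, so the decoder can pick a cheaper path when the end-of-block position is small; every highbd variant also takes bd so reconstruction clamps to the right pixel range. A sketch of the clamp-and-add step these kernels share (illustrative helpers of my own, with the transform and rounding omitted):

#include <stdint.h>

/* Clip a reconstructed value to [0, (1 << bd) - 1]. */
static uint16_t clip_pixel_highbd_sketch(int val, int bd) {
  const int max = (1 << bd) - 1;
  return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
}

/* Add an n x n residual block to the prediction in `dest`. */
static void add_residual_sketch(const int32_t *residual, uint16_t *dest,
                                int stride, int bd, int n) {
  int r, c;
  for (r = 0; r < n; ++r, dest += stride, residual += n)
    for (c = 0; c < n; ++c)
      dest[c] = clip_pixel_highbd_sketch(dest[c] + residual[c], bd);
}
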
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_neon
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_neon
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_neon
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_neon
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_horizontal_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_neon
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_neon
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_neon
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_neon
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_neon
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_neon
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
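The SAD entries describe sum-of-absolute-differences kernels. A sketch of what any of them computes (hypothetical helper, not library code); the _avg variants compare against the rounded average of ref and second_pred instead of ref alone. Note that the highbd prototypes still take uint8_t pointers: as far as I can tell from the library's convention, 16-bit buffers are passed through them and converted internally, which is why the signatures match the 8-bit ones.

#include <stdint.h>
#include <stdlib.h>

/* Sum of absolute differences over a w x h block. */
static unsigned int sad_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride,
                               int w, int h) {
  int r, c;
  unsigned int sad = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) sad += abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
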
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_neon
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_neon
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_neon
+
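The x4d form above is worth a usage note: one call produces SADs against four candidate references, which lets motion search amortize per-call overhead. A hypothetical usage sketch (not library code), assuming the generated vpx_dsp_rtcd.h is on the include path:

#include <stdint.h>
#include "./vpx_dsp_rtcd.h"

static void sad_4d_example(const uint8_t *src, int src_stride,
                           const uint8_t *const refs[4], int ref_stride,
                           uint32_t sad[4]) {
  /* Expands to vpx_highbd_sad16x16x4d_c on this configuration;
     sad[i] receives the SAD of src against refs[i]. */
  vpx_highbd_sad16x16x4d(src, src_stride, refs, ref_stride, sad);
}
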
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_neon
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_neon
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_neon
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_neon
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_neon
+
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
@@ -416,17 +1323,20 @@ unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint
#define vpx_mse8x8 vpx_mse8x8_c

void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b vpx_quantize_b_c
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_neon

void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon

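This hunk switches vpx_quantize_b and vpx_quantize_b_32x32 from the C fallback to new NEON kernels. For orientation, a heavily simplified per-coefficient sketch of what a quantize_b implementation computes; the real code also applies the zbin dead-zone and quant_shift refinement, walks coefficients in scan order, and tracks eob. The tran_low_t typedef here is an assumption standing in for the library's own.

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* assumption for this sketch */

static void quantize_sketch(const tran_low_t *coeff, intptr_t n,
                            const int16_t *round, const int16_t *quant,
                            const int16_t *dequant, tran_low_t *qcoeff,
                            tran_low_t *dqcoeff) {
  intptr_t i;
  for (i = 0; i < n; ++i) {
    const int sign = coeff[i] < 0 ? -1 : 1;
    const int a = abs((int)coeff[i]);
    const int rc = i == 0 ? 0 : 1; /* index 0 is DC, 1 covers all AC */
    const int q = (int)(((int64_t)(a + round[rc]) * quant[rc]) >> 16);
    qcoeff[i] = sign * q;                 /* quantized level */
    dqcoeff[i] = qcoeff[i] * dequant[rc]; /* reconstructed value */
  }
}
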
unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad16x16 vpx_sad16x16_neon

unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_neon

void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x16x3 vpx_sad16x16x3_c

@@ -439,223 +1349,247 @@ void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref
#define vpx_sad16x16x8 vpx_sad16x16x8_c

unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad16x32 vpx_sad16x32_c
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_neon

unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_neon

void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_neon

unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad16x8 vpx_sad16x8_neon

unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_neon

void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x8x3 vpx_sad16x8x3_c

void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_neon

void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad16x8x8 vpx_sad16x8x8_c

unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x16 vpx_sad32x16_c
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_neon

unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_neon

void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_neon

unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x32 vpx_sad32x32_neon

unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_neon

void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_neon

-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x64 vpx_sad32x64_c
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_neon

unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_neon

void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_neon

unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad4x4 vpx_sad4x4_neon

unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_neon

void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad4x4x3 vpx_sad4x4x3_c

void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_neon

void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad4x4x8 vpx_sad4x4x8_c

unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad4x8 vpx_sad4x8_c
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_neon

unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_neon

void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_neon

unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad64x32 vpx_sad64x32_c
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_neon

unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_neon

void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_neon

unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x64 vpx_sad64x64_neon

unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_neon

void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_neon

-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_neon

unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_neon

void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x16x3 vpx_sad8x16x3_c

void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_neon

void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x16x8 vpx_sad8x16x8_c

unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad8x4 vpx_sad8x4_c
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_neon

unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_neon

void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_neon

unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_neon

unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_neon

void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x3 vpx_sad8x8x3_c

void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_neon

void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_c

-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_neon(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
+int vpx_satd_neon(const tran_low_t *coeff, int length);
#define vpx_satd vpx_satd_neon

-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_neon

-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c

-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c

-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c

-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_horiz vpx_scaled_horiz_c

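Two API details change in this hunk besides the NEON hookups: vpx_satd now takes tran_low_t coefficients instead of int16_t (so one prototype serves high-bitdepth builds, where coefficients need more than 16 bits), and the vpx_scaled_* family replaces the filter_x/filter_y pointer pair with a single InterpKernel table plus explicit x0_q4/y0_q4 starting phases. What satd computes is simple enough to sketch (illustrative only; the tran_low_t typedef here is an assumption standing in for the library's own):

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* assumption for this sketch */

/* Sum of absolute values of a (typically Hadamard-transformed)
   coefficient block. */
static int satd_sketch(const tran_low_t *coeff, int length) {
  int i, satd = 0;
  for (i = 0; i < length; ++i) satd += abs((int)coeff[i]);
  return satd;
}
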
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_vert vpx_scaled_vert_c

uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon

uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon

uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon

uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon

uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon

uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon

uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon

uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon

uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon

uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon

uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon

uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon

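These sub_pixel_avg_variance kernels build a bilinearly interpolated prediction at the given (xoffset, yoffset) sub-pel position, blend it with second_pred, then measure variance against the reference via the identity variance = SSE - sum^2 / N. The variance half, as an illustrative helper of my own (not library code):

#include <stdint.h>

static uint32_t variance_sketch(const uint8_t *a, int a_stride,
                                const uint8_t *b, int b_stride,
                                int w, int h, uint32_t *sse) {
  int r, c;
  int64_t sum = 0;
  uint64_t sse64 = 0;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = a[c] - b[c];
      sum += d;
      sse64 += (uint32_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)sse64;
  /* variance = SSE minus the squared mean contribution. */
  return (uint32_t)(sse64 - (uint64_t)(((int64_t)sum * sum) / (w * h)));
}
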
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); -#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c +uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); @@ -682,10 +1616,12 @@ uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_str #define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c +uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); -#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c +uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); diff --git a/config/arm-neon/vpx_scale_rtcd.h b/config/arm-neon/vpx_scale_rtcd.h index a1564b7ad..b37136827 100644 --- a/config/arm-neon/vpx_scale_rtcd.h +++ b/config/arm-neon/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/arm-neon/vpx_version.h +++ b/config/arm-neon/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. 
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/arm/vp8_rtcd.h b/config/arm/vp8_rtcd.h
index e089d058d..188b1d7a2 100644
--- a/config/arm/vp8_rtcd.h
+++ b/config/arm/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
diff --git a/config/arm/vp9_rtcd.h b/config/arm/vp9_rtcd.h
index 6d67ad8bc..8cb5870c0 100644
--- a/config/arm/vp9_rtcd.h
+++ b/config/arm/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_c
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c
 
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/arm/vpx_config.asm b/config/arm/vpx_config.asm
index 022dfa9b8..53f2e8535 100644
--- a/config/arm/vpx_config.asm
+++ b/config/arm/vpx_config.asm
@@ -20,7 +20,9 @@
 .equ HAVE_SSE4_1 , 0
 .equ HAVE_AVX , 0
 .equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
 .equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
 .equ HAVE_VPX_PORTS , 1
 .equ HAVE_PTHREAD_H , 1
 .equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
 .equ CONFIG_TEMPORAL_DENOISING , 1
 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0
 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0
 .equ CONFIG_EXPERIMENTAL , 0
 .equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
 .equ CONFIG_SPATIAL_SVC , 0
 .equ CONFIG_FP_MB_STATS , 0
 .equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/arm/vpx_config.c b/config/arm/vpx_config.c
index 7bc1805f6..1bc63e4f0 100644
--- a/config/arm/vpx_config.c
+++ b/config/arm/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=armv7-linux-gcc --disable-neon --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=armv7-linux-gcc --disable-neon --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/arm/vpx_config.h b/config/arm/vpx_config.h
index ddd914e4b..039717798 100644
--- a/config/arm/vpx_config.h
+++ b/config/arm/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/arm/vpx_dsp_rtcd.h b/config/arm/vpx_dsp_rtcd.h
index 51b423f20..25ee2a9dd 100644
--- a/config/arm/vpx_dsp_rtcd.h
+++ b/config/arm/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_c
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_c
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_c
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_c
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_c
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_c
 
 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_16x16 vpx_hadamard_16x16_c
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_8x8 vpx_hadamard_8x8_c
 
 void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+
+unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+
+unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+
+unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+
+void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+
+void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+
+unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+
+unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+
+unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+
+unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+
+unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+
+unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+
+unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+
+unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+
+unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+
+unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+
+unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+
+unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+
+unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+
+unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+
+unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p);
+#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
+
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p);
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
+
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+
+void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
+
+void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c
+
+void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c
+
+void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c
+
+void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c
+
+void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c
+
+void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c
+
+void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps);
+#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c
+
+void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c
+
+void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c
+
+void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c
+
+void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c
+
+void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c
+
+void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void
vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, 
const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c @@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_c -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_c -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_c @@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_c -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_c @@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_c -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define 
vpx_sad64x64x4d vpx_sad64x64x4d_c -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_c @@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_c -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_c @@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c -int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_c(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_c -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_c -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void 
vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/arm/vpx_scale_rtcd.h b/config/arm/vpx_scale_rtcd.h index a1564b7ad..b37136827 100644 --- a/config/arm/vpx_scale_rtcd.h +++ b/config/arm/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/arm/vpx_version.h b/config/arm/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/arm/vpx_version.h +++ b/config/arm/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/arm64/vp8_rtcd.h b/config/arm64/vp8_rtcd.h index 3f112f6f7..4eb59c663 100644 --- a/config/arm64/vp8_rtcd.h +++ b/config/arm64/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h index 1df16205a..0f4f04d1f 100644 --- a/config/arm64/vp9_rtcd.h +++ b/config/arm64/vp9_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
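The vpx_scaled_* and vpx_convolve* prototypes in this update drop the old (filter_x, x_step_q4, filter_y, y_step_q4) arguments in favor of a single InterpKernel table plus explicit starting phases x0_q4 and y0_q4; the kernel type comes from vpx_dsp/vpx_filter.h, which the dsp header now includes. A sketch of how a caller walks the Q4 phase accumulator under that interface (the pick_taps helper is illustrative, not a libvpx symbol; InterpKernel is assumed to be an 8-tap int16_t row per subpel phase, as in vpx_filter.h):

    #include <stdint.h>

    #define SUBPEL_BITS 4
    #define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)  /* 16 subpel phases (Q4) */
    #define SUBPEL_TAPS 8

    typedef int16_t InterpKernel[SUBPEL_TAPS];  /* one 8-tap row per phase */

    /* x0_q4 is the starting phase and x_step_q4 the per-pixel increment;
     * the low 4 bits of the accumulator select the kernel row, while the
     * high bits would select the source sample. */
    static void pick_taps(const InterpKernel *filter, int x0_q4,
                          int x_step_q4, int w, const int16_t **taps_out) {
      int x_q4 = x0_q4;
      for (int x = 0; x < w; ++x) {
        taps_out[x] = filter[x_q4 & SUBPEL_MASK];
        x_q4 += x_step_q4;
      }
    }
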
#ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -33,9 +34,8 @@ extern "C" { int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_neon +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_c int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c @@ -53,35 +53,62 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht8x8 vp9_fht8x8_c -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sad_c - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_c + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c + +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c + +void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); +#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c + +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c + +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c + +void vp9_highbd_temporal_filter_apply_c(const uint8_t 
*frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c + void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_neon void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); -#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c - -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c +void vp9_scale_and_extend_frame_neon(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_neon void vp9_rtcd(void); diff --git a/config/arm64/vpx_config.asm b/config/arm64/vpx_config.asm index 7c69642cc..e3e1529bc 100644 --- a/config/arm64/vpx_config.asm +++ b/config/arm64/vpx_config.asm @@ -20,7 +20,9 @@ .equ HAVE_SSE4_1 , 0 .equ HAVE_AVX , 0 .equ HAVE_AVX2 , 0 +.equ HAVE_AVX512 , 0 .equ HAVE_VSX , 0 +.equ HAVE_MMI , 0 .equ HAVE_VPX_PORTS , 1 .equ HAVE_PTHREAD_H , 1 .equ HAVE_UNISTD_H , 1 @@ -74,10 +76,11 @@ .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 -.equ 
CONFIG_VP9_HIGHBITDEPTH , 0 +.equ CONFIG_VP9_HIGHBITDEPTH , 1 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 .equ CONFIG_EXPERIMENTAL , 0 .equ CONFIG_SIZE_LIMIT , 1 +.equ CONFIG_ALWAYS_ADJUST_BPM , 0 .equ CONFIG_SPATIAL_SVC , 0 .equ CONFIG_FP_MB_STATS , 0 .equ CONFIG_EMULATE_HARDWARE , 0 diff --git a/config/arm64/vpx_config.c b/config/arm64/vpx_config.c index ff9121723..13490c81c 100644 --- a/config/arm64/vpx_config.c +++ b/config/arm64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"; +static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/arm64/vpx_config.h b/config/arm64/vpx_config.h index f1acc55cc..5304f8a57 100644 --- a/config/arm64/vpx_config.h +++ b/config/arm64/vpx_config.h @@ -29,7 +29,9 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 #define HAVE_VSX 0 +#define HAVE_MMI 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define HAVE_UNISTD_H 1 @@ -83,10 +85,11 @@ #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 #define CONFIG_SPATIAL_SVC 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h index a915afabf..d911fd37f 100644 --- a/config/arm64/vpx_dsp_rtcd.h +++ b/config/arm64/vpx_dsp_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
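The vpx_config changes recorded here (CONFIG_VP9_HIGHBITDEPTH flipped from 0 to 1, with --enable-vp9-highbitdepth appended to the stored configure line) are what pull the vpx_highbd_* prototypes into these generated headers: 10- and 12-bit samples no longer fit in a byte, so the high-bit-depth paths traffic in uint16_t. A hedged sketch of the kind of compile-time gating such a switch implies (the pixel_t alias and clamp helper are illustrative, not libvpx code):

    #include <stdint.h>

    #define CONFIG_VP9_HIGHBITDEPTH 1

    #if CONFIG_VP9_HIGHBITDEPTH
    typedef uint16_t pixel_t;  /* room for 10/12-bit samples */
    #else
    typedef uint8_t pixel_t;   /* classic 8-bit path */
    #endif

    /* Clamp a reconstructed value to the legal range for bit depth bd,
     * e.g. [0, 1023] when bd == 10. */
    static pixel_t clamp_pixel(int v, int bd) {
      const int max = (1 << bd) - 1;
      return (pixel_t)(v < 0 ? 0 : (v > max ? max : v));
    }
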
#ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -28,38 +30,39 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); #define vpx_avg_8x8 vpx_avg_8x8_neon void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); -#define vpx_comp_avg_pred vpx_comp_avg_pred_c +void vpx_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_neon -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_neon -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_neon -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, 
int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_neon -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_neon -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_neon -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_neon void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -213,26 +216,32 @@ void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_c +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_neon void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16_1 vpx_fdct16x16_1_c +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_neon void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32 vpx_fdct32x32_c +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_neon void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_1 vpx_fdct32x32_1_c +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_neon void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_neon void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride); #define vpx_fdct4x4 vpx_fdct4x4_neon void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct4x4_1 vpx_fdct4x4_1_c +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_neon void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); @@ -273,17 +282,915 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_neon -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_8x8 
vpx_hadamard_8x8_neon void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t 
vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); 
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 
vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + 
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, 
unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, 
const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + 
+unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_neon + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_neon + +void 
vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_neon + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_neon + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_neon + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_neon + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_neon + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const 
uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_neon + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_neon + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_neon + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); 
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_neon + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_neon + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_neon + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_neon + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_neon + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_neon + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_neon + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_neon + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_neon + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_neon + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_neon + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_neon + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_neon + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_neon + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_neon + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_neon + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_neon + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_neon + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, 
const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_fdct8x8_1_neon + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_neon + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_neon + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_neon + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_neon + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_neon + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add 
vpx_highbd_idct16x16_256_add_neon + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_neon + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_neon + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_neon + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_neon + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_neon + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_neon + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_neon + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_neon + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_neon + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_neon + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_neon + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void 
vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_neon + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_neon + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_neon + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_neon + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_neon + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_neon + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_neon + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_neon + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_4_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_neon + +void 
vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_neon + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + 
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_neon + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_neon + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_neon + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); 
+void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_neon
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_neon
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_neon
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_neon
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_neon
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
@@ -416,17 +1323,20 @@ unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint
 #define vpx_mse8x8 vpx_mse8x8_c

 void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b vpx_quantize_b_c
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b vpx_quantize_b_neon

 void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon

 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x16 vpx_sad16x16_neon

 unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x16_avg vpx_sad16x16_avg_c
+unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x16_avg vpx_sad16x16_avg_neon

 void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad16x16x3 vpx_sad16x16x3_c
@@ -439,223 +1349,247 @@ void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref
 #define vpx_sad16x16x8 vpx_sad16x16x8_c

 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad16x32 vpx_sad16x32_c
+unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad16x32 vpx_sad16x32_neon

 unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x32_avg vpx_sad16x32_avg_c
+unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x32_avg vpx_sad16x32_avg_neon

 void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x32x4d vpx_sad16x32x4d_c
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x32x4d vpx_sad16x32x4d_neon

 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x8 vpx_sad16x8_neon

 unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad16x8_avg vpx_sad16x8_avg_c
+unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad16x8_avg vpx_sad16x8_avg_neon

 void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad16x8x3 vpx_sad16x8x3_c

 void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x4d vpx_sad16x8x4d_c
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad16x8x4d vpx_sad16x8x4d_neon

 void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad16x8x8 vpx_sad16x8x8_c

 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x16 vpx_sad32x16_c
+unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x16 vpx_sad32x16_neon

 unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x16_avg vpx_sad32x16_avg_c
+unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x16_avg vpx_sad32x16_avg_neon

 void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x16x4d vpx_sad32x16x4d_c
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x16x4d vpx_sad32x16x4d_neon

 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x32 vpx_sad32x32_neon

 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x32_avg vpx_sad32x32_avg_c
-
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
+unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x32_avg vpx_sad32x32_avg_neon

 void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_neon

-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad32x64 vpx_sad32x64_c
+unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad32x64 vpx_sad32x64_neon

 unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad32x64_avg vpx_sad32x64_avg_c
+unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad32x64_avg vpx_sad32x64_avg_neon

 void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x64x4d vpx_sad32x64x4d_c
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad32x64x4d vpx_sad32x64x4d_neon

 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x4 vpx_sad4x4_neon

 unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x4_avg vpx_sad4x4_avg_c
+unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x4_avg vpx_sad4x4_avg_neon

 void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x4x3 vpx_sad4x4x3_c

 void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x4d vpx_sad4x4x4d_c
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x4x4d vpx_sad4x4x4d_neon

 void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x4x8 vpx_sad4x4x8_c

 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad4x8 vpx_sad4x8_c
+unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad4x8 vpx_sad4x8_neon

 unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad4x8_avg vpx_sad4x8_avg_c
+unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad4x8_avg vpx_sad4x8_avg_neon

 void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x4d vpx_sad4x8x4d_c
-
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad4x8x4d vpx_sad4x8x4d_neon

 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad64x32 vpx_sad64x32_c
+unsigned int vpx_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad64x32 vpx_sad64x32_neon

 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x32_avg vpx_sad64x32_avg_c
+unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x32_avg vpx_sad64x32_avg_neon

 void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x32x4d vpx_sad64x32x4d_c
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad64x32x4d vpx_sad64x32x4d_neon

 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x64 vpx_sad64x64_neon

 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad64x64_avg vpx_sad64x64_avg_c
-
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
+unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad64x64_avg vpx_sad64x64_avg_neon

 void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_neon

-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_neon

 unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x16_avg vpx_sad8x16_avg_c
+unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x16_avg vpx_sad8x16_avg_neon

 void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x16x3 vpx_sad8x16x3_c

 void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x4d vpx_sad8x16x4d_c
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x16x4d vpx_sad8x16x4d_neon

 void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x16x8 vpx_sad8x16x8_c

 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_sad8x4 vpx_sad8x4_c
+unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_sad8x4 vpx_sad8x4_neon

 unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x4_avg vpx_sad8x4_avg_c
+unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x4_avg vpx_sad8x4_avg_neon

 void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x4d vpx_sad8x4x4d_c
-
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x4x4d vpx_sad8x4x4d_neon

 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_neon

 unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_sad8x8_avg vpx_sad8x8_avg_c
+unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_sad8x8_avg vpx_sad8x8_avg_neon

 void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x3 vpx_sad8x8x3_c

 void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x4d vpx_sad8x8x4d_c
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_sad8x8x4d vpx_sad8x8x4d_neon

 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x8 vpx_sad8x8x8_c

-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_neon(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
+int vpx_satd_neon(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_neon

-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_neon

-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c

-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c

-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c

-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c

-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c

 uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c
+uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon

 uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c
+uint32_t vpx_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_neon

 uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c
+uint32_t vpx_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_neon

 uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c
+uint32_t vpx_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_neon

 uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c
+uint32_t vpx_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_neon

 uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c
+uint32_t vpx_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_neon

 uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c
+uint32_t vpx_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_neon

 uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c
+uint32_t vpx_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_neon

 uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c
+uint32_t vpx_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_neon

 uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c
+uint32_t vpx_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_neon

 uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c
+uint32_t vpx_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_neon

 uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c
+uint32_t vpx_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_neon

 uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c
+uint32_t vpx_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_neon

 uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
@@ -682,10 +1616,12 @@ uint32_t vpx_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int source_str
 #define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_neon

 uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c
+uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_neon

 uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c
+uint32_t vpx_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_neon

 uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
 uint32_t vpx_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
diff --git a/config/arm64/vpx_scale_rtcd.h b/config/arm64/vpx_scale_rtcd.h
index a1564b7ad..b37136827 100644
--- a/config/arm64/vpx_scale_rtcd.h
+++ b/config/arm64/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c

+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/arm64/vpx_version.h
+++ b/config/arm64/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/generic/vp8_rtcd.h b/config/generic/vp8_rtcd.h
index 1e0ff8a7e..bc3ebe8a1 100644
--- a/config/generic/vp8_rtcd.h
+++ b/config/generic/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
diff --git a/config/generic/vp9_rtcd.h b/config/generic/vp9_rtcd.h
index 7d0a9e2ba..45a371c2f 100644
--- a/config/generic/vp9_rtcd.h
+++ b/config/generic/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c

-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c

 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_c

-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c

+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c

-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);

 #include "vpx_config.h"
diff --git a/config/generic/vpx_config.asm b/config/generic/vpx_config.asm
index 5d3ae3dfc..6b173b661 100644
--- a/config/generic/vpx_config.asm
+++ b/config/generic/vpx_config.asm
@@ -20,7 +20,9 @@
 .equ HAVE_SSE4_1 , 0
 .equ HAVE_AVX , 0
 .equ HAVE_AVX2 , 0
+.equ HAVE_AVX512 , 0
 .equ HAVE_VSX , 0
+.equ HAVE_MMI , 0
 .equ HAVE_VPX_PORTS , 1
 .equ HAVE_PTHREAD_H , 1
 .equ HAVE_UNISTD_H , 1
@@ -74,10 +76,11 @@
 .equ CONFIG_TEMPORAL_DENOISING , 1
 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0
 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 0
+.equ CONFIG_VP9_HIGHBITDEPTH , 1
 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0
 .equ CONFIG_EXPERIMENTAL , 0
 .equ CONFIG_SIZE_LIMIT , 1
+.equ CONFIG_ALWAYS_ADJUST_BPM , 0
 .equ CONFIG_SPATIAL_SVC , 0
 .equ CONFIG_FP_MB_STATS , 0
 .equ CONFIG_EMULATE_HARDWARE , 0
diff --git a/config/generic/vpx_config.c b/config/generic/vpx_config.c
index c6d3e14c5..70fcdf7e3 100644
--- a/config/generic/vpx_config.c
+++ b/config/generic/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=generic-gnu --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=generic-gnu --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/generic/vpx_config.h b/config/generic/vpx_config.h
index 63c75e4f9..dc96f8743 100644
--- a/config/generic/vpx_config.h
+++ b/config/generic/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/generic/vpx_dsp_rtcd.h b/config/generic/vpx_dsp_rtcd.h
index ae0cea137..be38303bf 100644
--- a/config/generic/vpx_dsp_rtcd.h
+++ b/config/generic/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"

 #ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c

-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_c

-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_c

-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c

-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c

-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_c

-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_c

-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_c

-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_c

 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c

-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_16x16 vpx_hadamard_16x16_c

-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_8x8 vpx_hadamard_8x8_c

 void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c

+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+
+unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+
+unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+
+unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+
+unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+
+unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+
+unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+
+unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+
+unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+
+unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+
+unsigned int
vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t 
vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, 
int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t 
*ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, 
ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + 
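[Editorial note — not part of the diff. The wall of *_variance prototypes above all follow one recipe; only the block size baked into the name and the bit depth prefix change. A minimal 8-bit sketch of that recipe, with a hypothetical name (the real vpx_highbd_* versions read uint16_t samples instead):]

#include <stdint.h>

/* Sketch only: returns variance and reports the raw sum of squared
 * differences through *sse, mirroring the libvpx variance contract. */
static unsigned int block_variance(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
  int64_t sum = 0;   /* signed sum of differences */
  uint64_t sq = 0;   /* sum of squared differences */
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int d = src[x] - ref[x];
      sum += d;
      sq += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sq;
  /* variance = sse minus the squared-mean correction term */
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}

[The sse output is the raw distortion; the return value subtracts the squared-mean term, which is why the API reports both.]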
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
 
@@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_c
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
 void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_c
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_c
 
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_c
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x32 vpx_sad64x32_c
 
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_c
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
 void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_c
 
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_c
 
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_c
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_c
 
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x8 vpx_sad8x8x8_c
 
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_c
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_2d vpx_scaled_2d_c
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
 uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
diff --git a/config/generic/vpx_scale_rtcd.h b/config/generic/vpx_scale_rtcd.h
index f419cc7a5..d12f52764 100644
--- a/config/generic/vpx_scale_rtcd.h
+++ b/config/generic/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/generic/vpx_version.h
+++ b/config/generic/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/mips32-dspr2/vp8_rtcd.h b/config/mips32-dspr2/vp8_rtcd.h
index a940d3594..e24387399 100644
--- a/config/mips32-dspr2/vp8_rtcd.h
+++ b/config/mips32-dspr2/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
diff --git a/config/mips32-dspr2/vp9_rtcd.h b/config/mips32-dspr2/vp9_rtcd.h
index 2e161c181..91d3a1aab 100644
--- a/config/mips32-dspr2/vp9_rtcd.h
+++ b/config/mips32-dspr2/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
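[Editorial note — not part of the diff. The vpx_version.h hunk a few lines up bumps the library from v1.6.1 to v1.7.0, and VERSION_PACKED folds major/minor/patch into one integer so client code can gate on a version with a single comparison. Worked out for these two releases:]

/* v1.7.0: (1 << 16) | (7 << 8) | 0 == 0x00010700 == 67328
 * v1.6.1: (1 << 16) | (6 << 8) | 1 == 0x00010601 == 67073
 * so newer builds always compare greater, as long as each field stays
 * below 256. A caller can therefore write, for example:
 *   #if VERSION_PACKED >= ((1 << 16) | (7 << 8) | 0)
 *   ... use a v1.7.0-or-later feature ...
 *   #endif
 */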
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -33,7 +34,7 @@ extern "C" {
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_c
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,23 +52,50 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_c
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c
 
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_dspr2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_dspr2
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_dspr2
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
 
 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_dspr2
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_c
 
@@ -78,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/mips32-dspr2/vpx_config.c b/config/mips32-dspr2/vpx_config.c
index 1aa002457..0471682f6 100644
--- a/config/mips32-dspr2/vpx_config.c
+++ b/config/mips32-dspr2/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips32-linux-gcc --enable-dspr2 --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips32-linux-gcc --enable-dspr2 --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/mips32-dspr2/vpx_config.h b/config/mips32-dspr2/vpx_config.h
index 6df484f60..9bdc19616 100644
--- a/config/mips32-dspr2/vpx_config.h
+++ b/config/mips32-dspr2/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/mips32-dspr2/vpx_dsp_rtcd.h b/config/mips32-dspr2/vpx_dsp_rtcd.h
index cdb0cfc6e..bd4acd0ff 100644
--- a/config/mips32-dspr2/vpx_dsp_rtcd.h
+++ b/config/mips32-dspr2/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
 
@@ -28,36 +30,36 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_dspr2
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t
src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_dspr2 -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_dspr2 -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_dspr2 -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_dspr2 -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const 
InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_dspr2 -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_dspr2 -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_dspr2 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -243,66 +245,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_dspr2 -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_c -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_8x8 vpx_hadamard_8x8_c void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t 
vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t 
vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void 
vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 
vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const 
uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, 
int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define 
vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 
vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c + +void 
vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t 
*dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const 
uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int 
ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int 
vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t 
*dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_10_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_10_add vpx_idct16x16_10_add_dspr2 +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_1_add vpx_idct16x16_1_add_dspr2 +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_256_add vpx_idct16x16_256_add_dspr2 +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_38_add vpx_idct16x16_256_add_dspr2 +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_dspr2 +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_dspr2 +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1_add vpx_idct32x32_1_add_dspr2 +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_34_add vpx_idct32x32_34_add_dspr2 +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct4x4_16_add 
vpx_idct4x4_16_add_dspr2 +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct4x4_1_add vpx_idct4x4_1_add_dspr2 +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_12_add vpx_idct8x8_12_add_dspr2 +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_1_add vpx_idct8x8_1_add_dspr2 +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_dspr2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_64_add vpx_idct8x8_64_add_dspr2 +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); #define vpx_int_pro_col vpx_int_pro_col_c @@ -439,15 +1256,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_c -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_c -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_c @@ -481,9 +1292,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_c -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_c @@ -499,15 +1307,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_c -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x64x4d vpx_sad64x64x4d_c -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - 
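The hunks above follow the pattern used throughout these generated RTCD (run-time CPU detection) headers: each DSP routine is declared with a `_c` suffix for the portable reference version, and a `#define` binds the unsuffixed name to whichever implementation this target gets. With `--enable-vp9-highbitdepth` now enabled for these MIPS configs, the dspr2/msa specializations are dropped and the macros rebind to the C versions, while the unused `x3`/`x8` SAD variants are removed outright. Below is a minimal sketch of that binding scheme; `my_sad_c` and `my_sad_dspr2` are hypothetical placeholder names, not real libvpx symbols, and the real generated headers emit only the already-resolved `#define` — the `#if` here just illustrates the choice the generator makes.

/*
 * Hypothetical sketch of the RTCD binding pattern seen in these headers.
 * my_sad_c / my_sad_dspr2 are placeholder names, not real libvpx symbols;
 * the generated headers contain only the resolved #define.
 */
#include <stdint.h>

unsigned int my_sad_c(const uint8_t *src, int src_stride,
                      const uint8_t *ref, int ref_stride);

#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
unsigned int my_sad_dspr2(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride);
#define my_sad my_sad_dspr2 /* platform-specific version available */
#else
#define my_sad my_sad_c /* fall back to the portable C reference */
#endif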
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_c @@ -532,9 +1334,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_c -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_c @@ -550,25 +1349,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c -int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_c(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_c -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_c -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void 
vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/mips32-dspr2/vpx_scale_rtcd.h b/config/mips32-dspr2/vpx_scale_rtcd.h index 15b1b5a6f..487bc29b8 100644 --- a/config/mips32-dspr2/vpx_scale_rtcd.h +++ b/config/mips32-dspr2/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -48,6 +49,9 @@ void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_dspr2(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_dspr2 +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/mips32-dspr2/vpx_version.h b/config/mips32-dspr2/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/mips32-dspr2/vpx_version.h +++ b/config/mips32-dspr2/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/mips32-msa/vp8_rtcd.h b/config/mips32-msa/vp8_rtcd.h index a851d7f13..00469b064 100644 --- a/config/mips32-msa/vp8_rtcd.h +++ b/config/mips32-msa/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ diff --git a/config/mips32-msa/vp9_rtcd.h b/config/mips32-msa/vp9_rtcd.h index d0adf351e..91d3a1aab 100644 --- a/config/mips32-msa/vp9_rtcd.h +++ b/config/mips32-msa/vp9_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
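For reference, the vpx_version.h hunk above moves these configs from libvpx v1.6.1 to v1.7.0. VERSION_PACKED stores the three version components in one integer, with eight bits each for minor and patch, so v1.7.0 packs to (1<<16)|(7<<8)|0 = 0x10700 and v1.6.1 to 0x10601. A small standalone check, with the VERSION_PACKED macro copied from the hunk and the rest written as a sketch:

/* Quick check of the VERSION_PACKED encoding from vpx_version.h;
 * version values are those of the v1.7.0 hunk in this diff. */
#include <assert.h>
#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 7
#define VERSION_PATCH 0
/* Major in bits 16 and up, minor in bits 8..15, patch in bits 0..7. */
#define VERSION_PACKED \
  ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

int main(void) {
  assert(VERSION_PACKED == 0x10700); /* v1.7.0 */
  printf("v%d.%d.%d packs to 0x%05x\n", VERSION_MAJOR, VERSION_MINOR,
         VERSION_PATCH, VERSION_PACKED);
  return 0;
}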
#ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -31,10 +32,9 @@ extern "C" { #endif int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -int64_t vp9_block_error_msa(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_msa +#define vp9_block_error vp9_block_error_c -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_c int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); @@ -44,35 +44,58 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr #define vp9_fdct8x8_quant vp9_fdct8x8_quant_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -void vp9_fht16x16_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht16x16 vp9_fht16x16_msa +#define vp9_fht16x16 vp9_fht16x16_c void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -void vp9_fht4x4_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht4x4 vp9_fht4x4_msa +#define vp9_fht4x4 vp9_fht4x4_c void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -void vp9_fht8x8_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_fht8x8 vp9_fht8x8_msa - -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sad_c +#define vp9_fht8x8 vp9_fht8x8_c void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); -void vp9_fwht4x4_msa(const int16_t *input, tran_low_t *output, int stride); -#define vp9_fwht4x4 vp9_fwht4x4_msa +#define vp9_fwht4x4 vp9_fwht4x4_c + +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_c + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c + +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c + +void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); +#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c + +void 
vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c + +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c + +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht16x16_256_add_msa(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -#define vp9_iht16x16_256_add vp9_iht16x16_256_add_msa +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -#define vp9_iht4x4_16_add vp9_iht4x4_16_add_msa +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -#define vp9_iht8x8_64_add vp9_iht8x8_64_add_msa +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_c @@ -83,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/config/mips32-msa/vpx_config.c b/config/mips32-msa/vpx_config.c index 21f7c3b82..737e5530b 100644 --- a/config/mips32-msa/vpx_config.c +++ b/config/mips32-msa/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. 
*/ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=mips32-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"; +static const char* const cfg = "--target=mips32-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/mips32-msa/vpx_config.h b/config/mips32-msa/vpx_config.h index 53831030d..9ca17a018 100644 --- a/config/mips32-msa/vpx_config.h +++ b/config/mips32-msa/vpx_config.h @@ -29,7 +29,9 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 #define HAVE_VSX 0 +#define HAVE_MMI 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define HAVE_UNISTD_H 1 @@ -83,10 +85,11 @@ #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 #define CONFIG_SPATIAL_SVC 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 diff --git a/config/mips32-msa/vpx_dsp_rtcd.h b/config/mips32-msa/vpx_dsp_rtcd.h index 22c63bfbc..4558d6960 100644 --- a/config/mips32-msa/vpx_dsp_rtcd.h +++ b/config/mips32-msa/vpx_dsp_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -30,36 +32,36 @@ unsigned int vpx_avg_8x8_msa(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); #define vpx_comp_avg_pred vpx_comp_avg_pred_c -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_msa -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 
ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_msa -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_msa -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_msa -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_msa -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int 
y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_msa -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg vpx_convolve_avg_msa -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_msa void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -205,35 +207,28 @@ void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_msa void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16 vpx_fdct16x16_msa +#define vpx_fdct16x16 vpx_fdct16x16_c void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct16x16_1_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct16x16_1 vpx_fdct16x16_1_msa +#define vpx_fdct16x16_1 vpx_fdct16x16_1_c void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct32x32_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32 vpx_fdct32x32_msa +#define vpx_fdct32x32 vpx_fdct32x32_c void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct32x32_1_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_1 vpx_fdct32x32_1_msa +#define vpx_fdct32x32_1 vpx_fdct32x32_1_c void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct32x32_rd_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct32x32_rd vpx_fdct32x32_rd_msa +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct4x4_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct4x4 vpx_fdct4x4_msa +#define vpx_fdct4x4 vpx_fdct4x4_c void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, 
int stride); #define vpx_fdct4x4_1 vpx_fdct4x4_1_c void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); -void vpx_fdct8x8_msa(const int16_t *input, tran_low_t *output, int stride); -#define vpx_fdct8x8 vpx_fdct8x8_msa +#define vpx_fdct8x8 vpx_fdct8x8_c void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *output, int stride); @@ -271,68 +266,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_msa -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_16x16_msa(const int16_t *src_diff, int src_stride, int16_t *coeff); -#define vpx_hadamard_16x16 vpx_hadamard_16x16_msa +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_c -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); -void vpx_hadamard_8x8_msa(const int16_t *src_diff, int src_stride, int16_t *coeff); -#define vpx_hadamard_8x8 vpx_hadamard_8x8_msa +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_c void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t 
vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int 
vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 
vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t 
vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int 
p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void 
vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t 
y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c + +void 
vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int 
pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int 
vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define 
vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_10_add_msa(const tran_low_t *input, uint8_t *dest, int 
stride);
-#define vpx_idct16x16_10_add vpx_idct16x16_10_add_msa
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c

void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_1_add vpx_idct16x16_1_add_msa
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c

void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_256_add vpx_idct16x16_256_add_msa
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c

void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct16x16_38_add vpx_idct16x16_256_add_msa
+#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c

void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_msa
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c

void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_msa
+#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c

void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_1_add vpx_idct32x32_1_add_msa
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c

void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct32x32_34_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct32x32_34_add vpx_idct32x32_34_add_msa
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c

void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_16_add vpx_idct4x4_16_add_msa
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c

void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct4x4_1_add vpx_idct4x4_1_add_msa
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c

void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_12_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_12_add vpx_idct8x8_12_add_msa
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c

void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_1_add vpx_idct8x8_1_add_msa
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c

void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_idct8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_idct8x8_64_add vpx_idct8x8_64_add_msa
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c
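Note the pattern running through this hunk: each MSA inverse-transform kernel is deleted and the dispatch macro is retargeted at the C fallback (in passing it also fixes vpx_idct16x16_38_add, which previously fell back to the 256-coefficient MSA kernel). This is consistent with the rest of the diff, where coefficient-domain functions now take tran_low_t, which widens beyond int16_t in high-bit-depth builds and no longer matches the int16_t-based MSA code, while pixel-domain MSA kernels survive. Callers are insulated by the macro indirection; a sketch, assuming the generated header is included the way libvpx sources usually include it, with a hypothetical helper whose eob thresholds mirror the suffixes rather than vp9's exact selection logic:

#include "./vpx_dsp_rtcd.h"  /* generated dispatch header, in-tree include style */

/* Hypothetical helper: callers use the macro name, so remapping
 * vpx_idct16x16_*_add from _msa to _c changes nothing at call sites. */
static void inverse_transform_16x16(const tran_low_t *coeff, uint8_t *dst,
                                    int stride, int eob) {
  if (eob == 1)
    vpx_idct16x16_1_add(coeff, dst, stride);    /* DC-only shortcut */
  else if (eob <= 10)
    vpx_idct16x16_10_add(coeff, dst, stride);   /* few nonzero coefficients */
  else
    vpx_idct16x16_256_add(coeff, dst, stride);  /* full 16x16 transform */
}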
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width);
@@ -343,12 +1151,10 @@ void vpx_int_pro_row_msa(int16_t *hbuf, const uint8_t *ref, const int ref_stride
#define vpx_int_pro_row vpx_int_pro_row_msa

void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_msa
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c

void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride);
-void vpx_iwht4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride);
-#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_msa
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c

void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
void vpx_lpf_horizontal_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
@@ -496,18 +1302,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad32x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad32x32_avg vpx_sad32x32_avg_msa

-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_msa
-
void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad32x32x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad32x32x4d vpx_sad32x32x4d_msa

-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_msa
-
unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad32x64 vpx_sad32x64_msa
@@ -552,10 +1350,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad4x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad4x8x4d vpx_sad4x8x4d_msa

-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_msa
-
unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad64x32 vpx_sad64x32_msa
@@ -576,18 +1370,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui
unsigned int vpx_sad64x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
#define vpx_sad64x64_avg vpx_sad64x64_avg_msa
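The x3 and x8 multi-SAD variants are dropped outright in these hunks, while the x4d form is kept with its MSA implementation: x4d scores one source block against four candidate references per call, which is the batch shape the motion-search loops actually use. A rough sketch of that contract, with a hypothetical helper on the 8-bit path and a fixed 64x64 block:

/* Illustrative only: mirrors the vpx_sad64x64x4d_c signature above --
 * one source block, four candidate references, four SADs out. */
#include <stdint.h>
#include <stdlib.h>

static void sad64x64x4d_sketch(const uint8_t *src, int src_stride,
                               const uint8_t *const ref[4], int ref_stride,
                               uint32_t sad[4]) {
  for (int k = 0; k < 4; ++k) {
    uint32_t acc = 0;
    const uint8_t *s = src;
    const uint8_t *r = ref[k];
    for (int i = 0; i < 64; ++i) {
      for (int j = 0; j < 64; ++j) acc += abs(s[j] - r[j]);
      s += src_stride;
      r += ref_stride;
    }
    sad[k] = acc;  /* sum of absolute differences for candidate k */
  }
}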
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_msa
-
void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
void vpx_sad64x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad64x64x4d vpx_sad64x64x4d_msa

-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_msa
-
unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x16 vpx_sad8x16_msa
@@ -620,10 +1406,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
void vpx_sad8x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
#define vpx_sad8x4x4d vpx_sad8x4x4d_msa

-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_msa
-
unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
unsigned int vpx_sad8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
#define vpx_sad8x8 vpx_sad8x8_msa
@@ -644,26 +1426,26 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p
void vpx_sad8x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
#define vpx_sad8x8x8 vpx_sad8x8x8_msa

-int vpx_satd_c(const int16_t *coeff, int length);
-int vpx_satd_msa(const int16_t *coeff, int length);
-#define vpx_satd vpx_satd_msa
+int vpx_satd_c(const tran_low_t *coeff, int length);
+#define vpx_satd vpx_satd_c

-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-#define vpx_scaled_2d vpx_scaled_2d_c
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+#define vpx_scaled_2d vpx_scaled_2d_msa

-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
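The vpx_scaled_* prototypes change shape here: instead of independent int16_t filter_x/filter_y pointers, they take a single InterpKernel filter bank plus explicit x0_q4/y0_q4 starting phases. An InterpKernel is a bank of 8-tap filter rows indexed by 1/16-pel phase, so a scaling convolution walks a phase accumulator and picks one row per output sample. A sketch of the per-sample step under those assumptions; the typedef name and helper are illustrative, and libvpx's usual 7-bit filter scale is assumed for the rounding:

typedef short InterpKernelRow[8];  /* stand-in for libvpx's 8-tap rows */

/* Hypothetical helper: apply the 8-tap row selected by the current
 * subpixel phase; src points at the sample under the filter center. */
static int filter_one_sample(const unsigned char *src,
                             const InterpKernelRow *kernel, int phase_q4) {
  const short *taps = kernel[phase_q4 & 0x0f];  /* 16 phases, 1/16 pel apart */
  int acc = 0;
  for (int t = 0; t < 8; ++t) acc += taps[t] * src[t - 3];
  return (acc + 64) >> 7;  /* round back to pixel range at 7-bit filter scale */
}

For output sample x, the phase is x0_q4 + x * x_step_q4: the integer part (phase >> 4) advances the source pointer and the low four bits select the row, which is exactly what the added x0_q4/y0_q4 arguments make explicit.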
int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/mips32-msa/vpx_scale_rtcd.h b/config/mips32-msa/vpx_scale_rtcd.h index ea70efc9d..eb6c009e1 100644 --- a/config/mips32-msa/vpx_scale_rtcd.h +++ b/config/mips32-msa/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/mips32-msa/vpx_version.h b/config/mips32-msa/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/mips32-msa/vpx_version.h +++ b/config/mips32-msa/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/mips32/vp8_rtcd.h b/config/mips32/vp8_rtcd.h index 21dfa5a25..fbd444b8a 100644 --- a/config/mips32/vp8_rtcd.h +++ b/config/mips32/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
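Editorial note: the vpx_version.h hunk above moves the bundled libvpx from v1.6.1 to v1.7.0. As a quick worked check of the VERSION_PACKED encoding defined there (major in bits 16 and up, minor in bits 8-15, patch in bits 0-7); this snippet is illustrative only and is not part of the diff:

#include <assert.h>

int main(void) {
  /* v1.7.0: (1 << 16) | (7 << 8) | 0 */
  assert(((1 << 16) | (7 << 8) | 0) == 0x10700); /* 67328 */
  /* old v1.6.1 packed value, for comparison */
  assert(((1 << 16) | (6 << 8) | 1) == 0x10601); /* 67073 */
  return 0;
}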
#ifndef VP8_RTCD_H_
#define VP8_RTCD_H_
diff --git a/config/mips32/vp9_rtcd.h b/config/mips32/vp9_rtcd.h
index c17a21721..91d3a1aab 100644
--- a/config/mips32/vp9_rtcd.h
+++ b/config/mips32/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
#ifndef VP9_RTCD_H_
#define VP9_RTCD_H_
@@ -33,7 +34,7 @@ extern "C" {
int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
#define vp9_block_error vp9_block_error_c
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_fht8x8 vp9_fht8x8_c
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
-
void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
#define vp9_fwht4x4 vp9_fwht4x4_c
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
@@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
void vp9_rtcd(void);
#include "vpx_config.h"
diff --git a/config/mips32/vpx_config.c b/config/mips32/vpx_config.c
index e2703b374..f66993f87 100644
--- a/config/mips32/vpx_config.c
+++ b/config/mips32/vpx_config.c
@@ -6,5 +6,5 @@
/* in the file PATENTS. All contributing project authors may */
/* be found in the AUTHORS file in the root of the source tree. */
#include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips32-linux-gcc --disable-dspr2 --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips32-linux-gcc --disable-dspr2 --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/mips32/vpx_config.h b/config/mips32/vpx_config.h
index beaa2f86c..d3ecfe8e7 100644
--- a/config/mips32/vpx_config.h
+++ b/config/mips32/vpx_config.h
@@ -29,7 +29,9 @@
#define HAVE_SSE4_1 0
#define HAVE_AVX 0
#define HAVE_AVX2 0
+#define HAVE_AVX512 0
#define HAVE_VSX 0
+#define HAVE_MMI 0
#define HAVE_VPX_PORTS 1
#define HAVE_PTHREAD_H 1
#define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
#define CONFIG_TEMPORAL_DENOISING 1
#define CONFIG_VP9_TEMPORAL_DENOISING 0
#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
#define CONFIG_BETTER_HW_COMPATIBILITY 0
#define CONFIG_EXPERIMENTAL 0
#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
#define CONFIG_SPATIAL_SVC 0
#define CONFIG_FP_MB_STATS 0
#define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/mips32/vpx_dsp_rtcd.h b/config/mips32/vpx_dsp_rtcd.h
index 1b15aadba..fbb38953d 100644
--- a/config/mips32/vpx_dsp_rtcd.h
+++ b/config/mips32/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
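Editorial note on the two type changes that drive most of the vpx_dsp_rtcd.h churn below: flipping CONFIG_VP9_HIGHBITDEPTH to 1 (see the vpx_config.h hunk above) widens transform coefficients, so coefficient arguments move from int16_t to tran_low_t (vp9_block_error_fp above; vpx_satd and the vpx_hadamard_* functions below), and the convolve/scale prototypes now take a single const InterpKernel *filter plus explicit starting phases (x0_q4, y0_q4) instead of separate pre-resolved filter_x/filter_y pointers. A sketch of the two definitions involved, as they appear in upstream vpx_dsp/vpx_dsp_common.h and vpx_dsp/vpx_filter.h (paraphrased here for orientation, not quoted from this diff):

/* vpx_dsp/vpx_dsp_common.h: 10/12-bit coefficients overflow int16_t */
#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;
#else
typedef int16_t tran_low_t;
#endif

/* vpx_dsp/vpx_filter.h: one 8-tap kernel per subpel phase; callers now
 * pass the whole table plus a phase index instead of two raw pointers */
#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS];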
#ifndef VPX_DSP_RTCD_H_
#define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
#ifdef __cplusplus
@@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
#define vpx_comp_avg_pred vpx_comp_avg_pred_c
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8 vpx_convolve8_c
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg vpx_convolve8_avg_c
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_c
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_c
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_c
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_c
void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8
vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 
vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 
vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); 
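A note on the vpx_highbd_* prototypes in this block: they keep const uint8_t * pixel arguments even though high-bit-depth samples are 16 bits wide. libvpx routes uint16_t buffers through pointer-shifting macros; the sketch below reproduces them from memory as they appear in vpx_ports/mem.h, so treat it as illustrative rather than part of this diff:

/* highbd buffers travel as uint8_t * with the address pre-shifted,
 * so the callee can recover the real uint16_t * */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

For example, a caller holding uint16_t *src16 and uint16_t *ref16 would invoke vpx_highbd_8_variance8x8(CONVERT_TO_BYTEPTR(src16), src_stride, CONVERT_TO_BYTEPTR(ref16), ref_stride, &sse), and the implementation converts the arguments back with CONVERT_TO_SHORTPTR before reading pixels.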
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void 
vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int 
bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 
vpx_highbd_dc_top_predictor_8x8_c + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define 
vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t 
*blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define 
vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c @@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int 
vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_c
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
 void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_c
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_c
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_c
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x32 vpx_sad64x32_c
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_c
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
 void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_c
 
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_c
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_c
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_c
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x8 vpx_sad8x8x8_c
 
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_c
 
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_2d vpx_scaled_2d_c
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
 uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
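The vpx_scaled_* prototypes just above (and the vpx_convolve8_* family later in this section) switch from separate filter_x/filter_y pointers with per-axis steps to a single InterpKernel filter bank plus explicit starting phases (x0_q4, y0_q4); vpx_dsp/vpx_filter.h is now pulled into the generated headers for that type. Below is a minimal caller sketch against the new signature. The InterpKernel typedef is an assumption intended to match vpx_dsp/vpx_filter.h, and demo_kernels/scale_half are hypothetical names used only for illustration; linking against libvpx is required to actually run it.

/* Sketch only: exercises the new-style vpx_scaled_2d_c() declared above.
 * The typedef below is an assumption matching vpx_dsp/vpx_filter.h. */
#include <stddef.h>
#include <stdint.h>

typedef int16_t InterpKernel[8];

/* Prototype as it appears in the generated header above. */
void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *filter,
                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                     int w, int h);

/* Hypothetical 16-phase kernel bank; row 0 is a pass-through tap set and
 * the remaining rows are left zeroed for brevity. */
static const InterpKernel demo_kernels[16] = {
  { 0, 0, 0, 128, 0, 0, 0, 0 },
};

/* Phases and steps are in 1/16-pel (q4) units: a step of 32 advances the
 * source by two pixels per output pixel, i.e. a 2:1 downscale starting at
 * phase 0 on both axes. */
static void scale_half(const uint8_t *src, ptrdiff_t src_stride,
                       uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
  vpx_scaled_2d_c(src, src_stride, dst, dst_stride, demo_kernels,
                  0, 32, 0, 32, w, h);
}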
diff --git a/config/mips32/vpx_scale_rtcd.h b/config/mips32/vpx_scale_rtcd.h
index ea70efc9d..eb6c009e1 100644
--- a/config/mips32/vpx_scale_rtcd.h
+++ b/config/mips32/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
diff --git a/config/mips32/vpx_version.h b/config/mips32/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips32/vpx_version.h
+++ b/config/mips32/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
diff --git a/config/mips64-msa/vp8_rtcd.h b/config/mips64-msa/vp8_rtcd.h
index a851d7f13..00469b064 100644
--- a/config/mips64-msa/vp8_rtcd.h
+++ b/config/mips64-msa/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
diff --git a/config/mips64-msa/vp9_rtcd.h b/config/mips64-msa/vp9_rtcd.h
index d0adf351e..91d3a1aab 100644
--- a/config/mips64-msa/vp9_rtcd.h
+++ b/config/mips64-msa/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
@@ -31,10 +32,9 @@ extern "C" {
 #endif
 
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-int64_t vp9_block_error_msa(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_msa
+#define vp9_block_error vp9_block_error_c
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_c
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
@@ -44,35 +44,58 @@ void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr
 #define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht16x16_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_msa
+#define vp9_fht16x16 vp9_fht16x16_c
 
 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht4x4_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_msa
+#define vp9_fht4x4 vp9_fht4x4_c
 
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-void vp9_fht8x8_msa(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_msa
-
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sad_c
+#define vp9_fht8x8 vp9_fht8x8_c
 
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vp9_fwht4x4_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vp9_fwht4x4 vp9_fwht4x4_msa
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_c
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
 
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-void vp9_iht16x16_256_add_msa(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
-#define vp9_iht16x16_256_add vp9_iht16x16_256_add_msa
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
 
 void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht4x4_16_add vp9_iht4x4_16_add_msa
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_c
 
 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-void vp9_iht8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
-#define vp9_iht8x8_64_add vp9_iht8x8_64_add_msa
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_c
@@ -83,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);
 
 #include "vpx_config.h"
diff --git a/config/mips64-msa/vpx_config.c b/config/mips64-msa/vpx_config.c
index c1ab4fb90..9665244e3 100644
--- a/config/mips64-msa/vpx_config.c
+++ b/config/mips64-msa/vpx_config.c
@@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=mips64-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=mips64-linux-gcc --enable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/mips64-msa/vpx_config.h b/config/mips64-msa/vpx_config.h
index ea5e8bd43..df9ed4455 100644
--- a/config/mips64-msa/vpx_config.h
+++ b/config/mips64-msa/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
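Two points of interest in the hunks above: the mips64-msa tree is now configured with --enable-vp9-highbitdepth (CONFIG_VP9_HIGHBITDEPTH flips from 0 to 1), which is what introduces the vp9_highbd_*/vpx_highbd_* prototypes seen throughout this section, and the package version moves from v1.6.1 to v1.7.0. A stand-alone sanity check of the packing arithmetic, with VERSION_PACKED copied from the mips32/vpx_version.h hunk:

#include <assert.h>
#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 7
#define VERSION_PATCH 0
/* Copied from the vpx_version.h hunk above: 8 bits each for minor and
 * patch below the major number. */
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

int main(void) {
  /* v1.7.0 packs to 0x010700; the old v1.6.1 packed to 0x010601. */
  assert(VERSION_PACKED == 0x010700);
  printf("v%d.%d.%d -> 0x%06x\n", VERSION_MAJOR, VERSION_MINOR,
         VERSION_PATCH, VERSION_PACKED);
  return 0;
}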
diff --git a/config/mips64-msa/vpx_dsp_rtcd.h b/config/mips64-msa/vpx_dsp_rtcd.h
index 22c63bfbc..4558d6960 100644
--- a/config/mips64-msa/vpx_dsp_rtcd.h
+++ b/config/mips64-msa/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
@@ -30,36 +32,36 @@ unsigned int vpx_avg_8x8_msa(const uint8_t *, int p);
 void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_c
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_msa
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_msa
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_msa
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_msa
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_msa
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_msa
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_msa
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_msa
 
 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -205,35 +207,28 @@ void vpx_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_
 #define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_msa
 
 void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct16x16_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16 vpx_fdct16x16_msa
+#define vpx_fdct16x16 vpx_fdct16x16_c
 
 void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct16x16_1_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct16x16_1 vpx_fdct16x16_1_msa
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
 
 void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32 vpx_fdct32x32_msa
+#define vpx_fdct32x32 vpx_fdct32x32_c
 
 void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_1_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_1 vpx_fdct32x32_1_msa
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
 
 void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct32x32_rd_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct32x32_rd vpx_fdct32x32_rd_msa
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
 
 void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct4x4_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct4x4 vpx_fdct4x4_msa
+#define vpx_fdct4x4 vpx_fdct4x4_c
 
 void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
 #define vpx_fdct4x4_1 vpx_fdct4x4_1_c
 
 void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct8x8_msa(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct8x8 vpx_fdct8x8_msa
+#define vpx_fdct8x8 vpx_fdct8x8_c
 
 void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
 void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *output, int stride);
@@ -271,68 +266,881 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_msa
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_msa(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_16x16 vpx_hadamard_16x16_msa
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_c
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_msa(const int16_t *src_diff, int src_stride, int16_t *coeff);
-#define vpx_hadamard_8x8 vpx_hadamard_8x8_msa
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_c
 
 void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
+void vpx_highbd_10_get16x16var_c(const
uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 
vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned 
int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t 
*second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t 
vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 
vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int 
vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, 
const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 
vpx_highbd_dc_left_predictor_32x32_c + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c + +void 
vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c + +void 
vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], 
int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c +
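The *x4d entries above bundle four SAD computations against four candidate reference blocks into a single call, which is where SIMD versions get their win; the C fallback amounts to a loop over the scalar kernel. A minimal sketch under that reading; sad_16bit and sad_16bit_x4d are hypothetical helpers, not libvpx symbols:

#include <stdint.h>
#include <stdlib.h>

/* Scalar sum of absolute differences over one w x h block. */
static uint32_t sad_16bit(const uint16_t *src, int src_stride,
                          const uint16_t *ref, int ref_stride, int w, int h) {
  uint32_t sad = 0;
  for (int i = 0; i < h; ++i)
    for (int j = 0; j < w; ++j)
      sad += (uint32_t)abs(src[i * src_stride + j] - ref[i * ref_stride + j]);
  return sad;
}

/* The x4d shape: one source block scored against four references at once. */
static void sad_16bit_x4d(const uint16_t *src, int src_stride,
                          const uint16_t *const ref[4], int ref_stride,
                          int w, int h, uint32_t sad_array[4]) {
  for (int k = 0; k < 4; ++k)
    sad_array[k] = sad_16bit(src, src_stride, ref[k], ref_stride, w, h);
}

Batching four references amortizes the source loads during motion search, which is presumably why the generated headers keep a distinct x4d entry point next to the plain and _avg variants.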
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_10_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_10_add vpx_idct16x16_10_add_msa +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_c void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_1_add vpx_idct16x16_1_add_msa +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_c void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_256_add vpx_idct16x16_256_add_msa +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_c void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_38_add vpx_idct16x16_256_add_msa +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_c void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_msa +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_msa +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_c void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void 
vpx_idct32x32_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1_add vpx_idct32x32_1_add_msa +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_c void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_34_add vpx_idct32x32_34_add_msa +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_c void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct4x4_16_add vpx_idct4x4_16_add_msa +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_c void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct4x4_1_add vpx_idct4x4_1_add_msa +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_c void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_12_add vpx_idct8x8_12_add_msa +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_c void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_1_add vpx_idct8x8_1_add_msa +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_c void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_64_add vpx_idct8x8_64_add_msa +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width); @@ -343,12 +1151,10 @@ void vpx_int_pro_row_msa(int16_t *hbuf, const uint8_t *ref, const int ref_stride #define vpx_int_pro_row vpx_int_pro_row_msa void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_16_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_msa +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_1_add_msa(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_msa +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); void vpx_lpf_horizontal_16_msa(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -496,18 +1302,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_msa -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_msa - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad32x32x4d_msa(const uint8_t 
*src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_msa -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad32x32x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_msa - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_msa @@ -552,10 +1350,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad4x8x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_msa -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad4x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_msa - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_msa @@ -576,18 +1370,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_msa -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x3_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_msa - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad64x64x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x64x4d vpx_sad64x64x4d_msa -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad64x64x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_msa - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_msa @@ -620,10 +1406,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x4x4d_msa(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_msa -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -void vpx_sad8x4x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_msa - unsigned int 
vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_msa @@ -644,26 +1426,26 @@ void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_p void vpx_sad8x8x8_msa(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_msa -int vpx_satd_c(const int16_t *coeff, int length); -int vpx_satd_msa(const int16_t *coeff, int length); -#define vpx_satd vpx_satd_msa +int vpx_satd_c(const tran_low_t *coeff, int length); +#define vpx_satd vpx_satd_c -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -#define vpx_scaled_2d vpx_scaled_2d_c +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_msa -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c
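The vpx_scaled_* hunk above changes the calling convention: instead of two independent int16_t filter arrays with an implicit zero starting phase, callers now pass one bank of subpel kernels (InterpKernel) plus explicit starting phases x0_q4/y0_q4 alongside the q4 step sizes, which is what frame scaling needs when the first output sample does not sit on a full-pel source position. A minimal sketch of the horizontal phase stepping, assuming libvpx-style constants (4 subpel bits, 8-tap kernels, 7-bit filter rounding); scaled_horiz_row is a hypothetical helper, not the library's code:

#include <stdint.h>
#include <stddef.h>

typedef int16_t InterpKernel[8]; /* the type named in the prototypes above */
#define SUBPEL_BITS 4
#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)

static void scaled_horiz_row(const uint8_t *src, uint8_t *dst,
                             const InterpKernel *filter,
                             int x0_q4, int x_step_q4, int w) {
  int x_q4 = x0_q4; /* running source position in 1/16-pel units */
  for (int x = 0; x < w; ++x) {
    const uint8_t *s = &src[x_q4 >> SUBPEL_BITS];  /* integer pixel position */
    const int16_t *k = filter[x_q4 & SUBPEL_MASK]; /* phase selects the kernel */
    int sum = 0;
    for (int t = 0; t < 8; ++t) sum += s[t] * k[t];
    sum = (sum + 64) >> 7; /* round by FILTER_BITS == 7 */
    dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    x_q4 += x_step_q4; /* advance one output pixel per step */
  }
}

The same x0_q4/x_step_q4 pair appears in the vpx_highbd_convolve8_* prototypes earlier in this header, so the scaled and unscaled paths can share one kernel table.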
y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/mips64-msa/vpx_scale_rtcd.h b/config/mips64-msa/vpx_scale_rtcd.h index ea70efc9d..eb6c009e1 100644 --- a/config/mips64-msa/vpx_scale_rtcd.h +++ b/config/mips64-msa/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/mips64-msa/vpx_version.h b/config/mips64-msa/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/mips64-msa/vpx_version.h +++ b/config/mips64-msa/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/mips64/vp8_rtcd.h b/config/mips64/vp8_rtcd.h index 21dfa5a25..fbd444b8a 100644 --- a/config/mips64/vp8_rtcd.h +++ b/config/mips64/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ diff --git a/config/mips64/vp9_rtcd.h b/config/mips64/vp9_rtcd.h index c17a21721..91d3a1aab 100644 --- a/config/mips64/vp9_rtcd.h +++ b/config/mips64/vp9_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
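
The vpx_version.h hunk above bumps the library from v1.6.1 to v1.7.0. A worked check of the VERSION_PACKED macro defined in that same hunk, using the new values (a minimal standalone sketch, restating only what the hunk itself shows):

#include <assert.h>
#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 7
#define VERSION_PATCH 0
#define VERSION_PACKED ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

int main(void) {
  /* v1.7.0 packs to (1 << 16) | (7 << 8) | 0 = 0x010700 (67328 decimal);
   * the old v1.6.1 would pack to 0x010601. */
  assert(VERSION_PACKED == 0x010700);
  printf("packed: 0x%06x\n", VERSION_PACKED);
  return 0;
}
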
#ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -33,7 +34,7 @@ extern "C" { int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_c -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_c int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); @@ -51,12 +52,42 @@ void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_t void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht8x8 vp9_fht8x8_c -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sad_c - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_c + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c + +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c + +void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); +#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c + +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c + +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c + +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +#define 
vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c + void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c @@ -75,9 +106,6 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c - void vp9_rtcd(void); #include "vpx_config.h" diff --git a/config/mips64/vpx_config.c b/config/mips64/vpx_config.c index f7ec4e6d8..5eb7dff03 100644 --- a/config/mips64/vpx_config.c +++ b/config/mips64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=mips64-linux-gcc --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"; +static const char* const cfg = "--target=mips64-linux-gcc --disable-msa --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/mips64/vpx_config.h b/config/mips64/vpx_config.h index 9efd808ed..8e67ca3a7 100644 --- a/config/mips64/vpx_config.h +++ b/config/mips64/vpx_config.h @@ -29,7 +29,9 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 #define HAVE_VSX 0 +#define HAVE_MMI 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define HAVE_UNISTD_H 1 @@ -83,10 +85,11 @@ #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 #define CONFIG_SPATIAL_SVC 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 diff --git a/config/mips64/vpx_dsp_rtcd.h b/config/mips64/vpx_dsp_rtcd.h index 1b15aadba..fbb38953d 100644 --- a/config/mips64/vpx_dsp_rtcd.h +++ b/config/mips64/vpx_dsp_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
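
The vpx_dsp_rtcd.h hunks that follow (like the vpx_scaled_* hunks earlier) replace the old pair of filter_x/filter_y pointers with a single const InterpKernel *filter plus explicit x0_q4/y0_q4 starting phases, and the header now pulls in vpx_dsp/vpx_filter.h for the typedef. A minimal sketch of how an old-style call site maps onto the new shape, assuming the 256-byte-aligned, 16-entry kernel-table layout libvpx uses internally; the helper names and the masking trick are restated from the upstream sources as assumptions, not part of the public API:

#include <stddef.h>
#include <stdint.h>

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS]; /* mirrors vpx_dsp/vpx_filter.h */

/* New-style prototype, exactly as declared in the '+' lines below. */
void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                     ptrdiff_t dst_stride, const InterpKernel *filter,
                     int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                     int w, int h);

/* Old callers held a pointer into the middle of a 16-entry kernel table:
 * the table base names the filter, the offset names the first subpel
 * phase. Assuming 256-byte-aligned tables, the base is recoverable by
 * masking the low address bits. */
static const InterpKernel *filter_base(const int16_t *f) {
  return (const InterpKernel *)((intptr_t)f & ~(intptr_t)0xFF);
}

static int filter_offset(const int16_t *f, const InterpKernel *base) {
  return (int)((f - (const int16_t *)base) / SUBPEL_TAPS);
}

/* Illustrative adapter with the pre-1.7.0 shape shown in the '-' lines;
 * assumes filter_x and filter_y come from the same kernel table, as they
 * do in practice in libvpx. */
static void convolve8_old_style(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h) {
  const InterpKernel *base = filter_base(filter_x);
  vpx_convolve8_c(src, src_stride, dst, dst_stride, base,
                  filter_offset(filter_x, base), x_step_q4,
                  filter_offset(filter_y, base), y_step_q4, w, h);
}
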
#ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -28,28 +30,28 @@ unsigned int vpx_avg_8x8_c(const uint8_t *, int p); void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); #define vpx_comp_avg_pred vpx_comp_avg_pred_c -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_c -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_c -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_c -void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_c -void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_horiz vpx_convolve8_horiz_c -void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_vert vpx_convolve8_vert_c -void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_avg 
vpx_convolve_avg_c -void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve_copy vpx_convolve_copy_c void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -229,15 +231,843 @@ void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_c -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_16x16 vpx_hadamard_16x16_c -void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); #define vpx_hadamard_8x8 vpx_hadamard_8x8_c void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c +void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c + +void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c + +unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c + +unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c + +unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c + +unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 
vpx_highbd_10_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 
vpx_highbd_10_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c + +unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c + +unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance16x8 
vpx_highbd_10_variance16x8_c + +unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c + +unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c + +unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c + +unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c + +unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c + +unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c + +unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c + +unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c + +unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c + +unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c + +void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c + +void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c + +unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c + +unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c + +unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c + +unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define 
vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const 
uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const 
uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const 
uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); 
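
Note that the vpx_highbd_*_variance/mse declarations above take const uint8_t * even though high-bit-depth pixels are 16-bit: callers pass a shifted "byte pointer" handle and the implementation shifts it back. A minimal sketch of that idiom; the macro bodies are restated from vpx_dsp/vpx_dsp_common.h as an assumption and should be checked against the tree being built:

#include <stdint.h>

/* Assumption (restated from vpx_dsp/vpx_dsp_common.h): a uint16_t buffer
 * is smuggled through uint8_t * parameters by shifting the address, and
 * shifted back inside the highbd implementation. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

/* Prototype exactly as declared in the hunk above. */
unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride,
                                        const uint8_t *ref_ptr, int ref_stride,
                                        unsigned int *sse);

/* Hypothetical call site: wrap real uint16_t pixel buffers before
 * calling through the uint8_t * interface. */
static unsigned int variance8x8_hbd(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride,
                                    unsigned int *sse) {
  return vpx_highbd_8_variance8x8_c(CONVERT_TO_BYTEPTR(src), src_stride,
                                    CONVERT_TO_BYTEPTR(ref), ref_stride, sse);
}
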
+#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_c + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_c + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_c + +void 
vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_c
+
+void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_c
+
+void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_c
+
+void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c
+
+void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c
+
+void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c
+
+void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c
+
+void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c
+
+void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c
+
+void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c
+
+void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c
+
+void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_c
+
+void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_c
+
+void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_c
+
+void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_c
+
+void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c
+
+void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c
+
+void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c
+
+void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c
+
+void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_c
+
+void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_c
+
+void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_c
+
+void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_c
+
+void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_c
+
+void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_c
+
+void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_c
+
+void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_c
+
+void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_c
+
+void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_c
+
+void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_c
+
+void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_c
+
+void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_c
+
+void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_c
+
+void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_c
+
+void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_c
+
+void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+
+void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+
+void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+
+void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+
+void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+
+void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+
+void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+
+void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c
+
+void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_c
+
+void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_c
+
+void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_c
+
+void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c
+
+void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c
+
+void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c
+
+void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c
+
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_c
+
+void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c
+
+void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_c
+
+void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c
+
+void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c
+
+void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c
+
+void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
+
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
+
+void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
+
+void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c
+
+void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c
+
+void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd);
+#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c
+
+void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_c
+
+void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_c
+
+void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_c
+
+void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_c
+
+void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_c
+
+void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_c
+
+void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_c
+
+void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_c
+
+void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_c
+
+void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_c
+
+void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd);
+#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_c
+
+void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd);
+#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_c
+
+void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max);
+#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
+
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+
+unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+
+unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+
+unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+
+unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+
+unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+
+unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+
+unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+
+unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+
+unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+
+unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+
+unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+
+unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+
+unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+
+unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+
+unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+
+unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+
+unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+
+unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+
+unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+
+unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+
+unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+
+unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+
+unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+
+unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+
+unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+
+unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+
+void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+
+void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_c
+
+void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_c
+
+void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_c
+
+void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_c
+
+void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_c
+
+void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_c
+
+void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_c
+
+void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd);
+#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c
+
 void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride);
 #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c
 
@@ -400,15 +1230,9 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_c
 
-void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x3 vpx_sad32x32x3_c
-
 void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_c
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_c
 
@@ -442,9 +1266,6 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_c
 
-void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x8x8 vpx_sad4x8x8_c
-
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad64x32 vpx_sad64x32_c
 
@@ -460,15 +1281,9 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_c
 
-void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x3 vpx_sad64x64x3_c
-
 void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_c
 
-void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad64x64x8 vpx_sad64x64x8_c
-
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x16 vpx_sad8x16_c
 
@@ -493,9 +1308,6 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint
 void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_c
 
-void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x4x8 vpx_sad8x4x8_c
-
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x8 vpx_sad8x8_c
 
@@ -511,25 +1323,25 @@ void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con
 void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
 #define vpx_sad8x8x8 vpx_sad8x8x8_c
 
-int vpx_satd_c(const int16_t *coeff, int length);
+int vpx_satd_c(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_c
 
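The vpx_satd retype just above (int16_t to tran_low_t) follows from enabling CONFIG_VP9_HIGHBITDEPTH elsewhere in this change: coefficient buffers are declared as tran_low_t so one set of prototypes serves 8-, 10- and 12-bit builds. A minimal sketch of the upstream typedef, paraphrasing vpx_dsp/vpx_dsp_common.h (the comments are illustration, not the upstream text):

    #include <stdint.h>

    /* With high bit depth enabled, intermediate transform coefficients can
     * exceed 16 bits, so the coefficient type widens to 32 bits. This is why
     * vpx_satd, the vpx_hadamard_* functions and vp9_block_error_fp change
     * from int16_t* to tran_low_t* in this diff. */
    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int32_t tran_low_t;
    #else
    typedef int16_t tran_low_t;
    #endif
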
-void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_2d vpx_scaled_2d_c
 
-void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c
 
-void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c
 
-void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c
 
-void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_horiz vpx_scaled_horiz_c
 
-void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_scaled_vert vpx_scaled_vert_c
 
 uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
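The scaled_* prototypes above (and the convolve8 family in config/x86/vpx_dsp_rtcd.h later in this diff) swap the filter_x/filter_y pointer pair for a single InterpKernel table plus explicit x0_q4/y0_q4 starting phases; the old API encoded the starting phase by pre-offsetting the filter pointers. A hedged sketch of the migration at a call site (the wrapper and its argument names are hypothetical; InterpKernel is the 8-tap kernel type from vpx_dsp/vpx_filter.h):

    #include <stddef.h>
    #include <stdint.h>

    typedef int16_t InterpKernel[8]; /* per vpx_dsp/vpx_filter.h, SUBPEL_TAPS == 8 */

    /* New prototype, as declared above. */
    void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h);

    /* Hypothetical caller. 'kernels' is the 16-entry subpel filter bank and
     * x0_q4/y0_q4 are the starting 1/16-pel phases; the old API would have
     * been handed kernels[x0_q4] and kernels[y0_q4] instead. */
    static void scale_block(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const InterpKernel kernels[16],
                            int x0_q4, int x_step_q4,
                            int y0_q4, int y_step_q4, int w, int h) {
      vpx_scaled_2d_c(src, src_stride, dst, dst_stride, kernels,
                      x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
    }
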
diff --git a/config/mips64/vpx_scale_rtcd.h b/config/mips64/vpx_scale_rtcd.h
index ea70efc9d..eb6c009e1 100644
--- a/config/mips64/vpx_scale_rtcd.h
+++ b/config/mips64/vpx_scale_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_SCALE_RTCD_H_
 #define VPX_SCALE_RTCD_H_
 
@@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
 void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
 #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
 
+void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c
+
 void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
 #define vpx_yv12_copy_y vpx_yv12_copy_y_c
 
diff --git a/config/mips64/vpx_version.h b/config/mips64/vpx_version.h
index 24da169b4..6078bae90 100644
--- a/config/mips64/vpx_version.h
+++ b/config/mips64/vpx_version.h
@@ -1,7 +1,8 @@
+// This file is generated. Do not edit.
 #define VERSION_MAJOR 1
-#define VERSION_MINOR 6
-#define VERSION_PATCH 1
+#define VERSION_MINOR 7
+#define VERSION_PATCH 0
 #define VERSION_EXTRA ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.1"
-#define VERSION_STRING " v1.6.1"
+#define VERSION_STRING_NOSP "v1.7.0"
+#define VERSION_STRING " v1.7.0"
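The version bump also changes the packed value that downstream code can compare numerically; the VERSION_PACKED macro itself is unchanged, so the following is just arithmetic on the values shown above:

    #include <stdio.h>

    #define VERSION_MAJOR 1
    #define VERSION_MINOR 7
    #define VERSION_PATCH 0
    #define VERSION_PACKED \
      ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

    int main(void) {
      /* v1.7.0 packs to (1 << 16) | (7 << 8) | 0 = 0x10700 (67328);
       * the previous v1.6.1 packed to (1 << 16) | (6 << 8) | 1 = 0x10601. */
      printf("VERSION_PACKED = 0x%x\n", VERSION_PACKED);
      return 0;
    }
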
diff --git a/config/x86/vp8_rtcd.h b/config/x86/vp8_rtcd.h
index 77479a23b..3afbea668 100644
--- a/config/x86/vp8_rtcd.h
+++ b/config/x86/vp8_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP8_RTCD_H_
 #define VP8_RTCD_H_
 
diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h
index b19d6cc5a..49e9885aa 100644
--- a/config/x86/vp9_rtcd.h
+++ b/config/x86/vp9_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VP9_RTCD_H_
 #define VP9_RTCD_H_
 
@@ -34,15 +35,14 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, in
 int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
 #define vp9_block_error vp9_block_error_sse2
 
-int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
-int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
 #define vp9_block_error_fp vp9_block_error_fp_sse2
 
 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_fdct8x8_quant vp9_fdct8x8_quant_ssse3
 
@@ -58,14 +58,44 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_t
 void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 #define vp9_fht8x8 vp9_fht8x8_sse2
 
-int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
-#define vp9_full_search_sad vp9_full_search_sadx3
-
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_sse2
 
+int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd);
+#define vp9_highbd_block_error vp9_highbd_block_error_sse2
+
+void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
+
+void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+
+void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
+
+void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c
+
+void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd);
+#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c
+
+void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c
+
+void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
+#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
+
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+
+void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
+#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
+
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2
 
@@ -89,9 +119,6 @@ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct y
 void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
 #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_ssse3
 
-void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count);
-#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
-
 void vp9_rtcd(void);
 
 #ifdef RTCD_C
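All of these rtcd headers follow one pattern: because this build passes --disable-runtime-cpu-detect (see the configure string in vpx_config.c below), every entry point resolves at build time to a plain macro alias for the best variant the target is known to support (e.g. vp9_block_error maps to vp9_block_error_sse2 above), and the vp9_rtcd() setup function has no function pointers left to patch. A schematic of the pattern with a hypothetical function name:

    /* Schematic only; the real headers are generated from the *_rtcd_defs.pl
     * definitions, not written by hand. */
    int foo_c(int x);    /* portable reference implementation              */
    int foo_sse2(int x); /* specialized variant, always safe on this       */
                         /* target, so it is selected statically           */
    #define foo foo_sse2 /* every foo() call compiles to a direct call     */

    /* With runtime detection enabled, foo would instead be a function
     * pointer that the rtcd() setup routine aims at the best variant
     * after probing the CPU. */
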
diff --git a/config/x86/vpx_config.asm b/config/x86/vpx_config.asm
index e9ff1c42f..d9848bace 100644
--- a/config/x86/vpx_config.asm
+++ b/config/x86/vpx_config.asm
@@ -17,7 +17,9 @@
 %define HAVE_SSE4_1 0
 %define HAVE_AVX 0
 %define HAVE_AVX2 0
+%define HAVE_AVX512 0
 %define HAVE_VSX 0
+%define HAVE_MMI 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 1
 %define HAVE_UNISTD_H 1
@@ -71,10 +73,11 @@
 %define CONFIG_TEMPORAL_DENOISING 1
 %define CONFIG_VP9_TEMPORAL_DENOISING 0
 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-%define CONFIG_VP9_HIGHBITDEPTH 0
+%define CONFIG_VP9_HIGHBITDEPTH 1
 %define CONFIG_BETTER_HW_COMPATIBILITY 0
 %define CONFIG_EXPERIMENTAL 0
 %define CONFIG_SIZE_LIMIT 1
+%define CONFIG_ALWAYS_ADJUST_BPM 0
 %define CONFIG_SPATIAL_SVC 0
 %define CONFIG_FP_MB_STATS 0
 %define CONFIG_EMULATE_HARDWARE 0
diff --git a/config/x86/vpx_config.c b/config/x86/vpx_config.c
index 77a386493..2d3f0f735 100644
--- a/config/x86/vpx_config.c
+++ b/config/x86/vpx_config.c
@@ -6,5 +6,5 @@
 /* in the file PATENTS. All contributing project authors may */
 /* be found in the AUTHORS file in the root of the source tree. */
 #include "vpx/vpx_codec.h"
-static const char* const cfg = "--target=x86-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072";
+static const char* const cfg = "--target=x86-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth";
 const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/config/x86/vpx_config.h b/config/x86/vpx_config.h
index 11a5e94ad..5b0fed08f 100644
--- a/config/x86/vpx_config.h
+++ b/config/x86/vpx_config.h
@@ -29,7 +29,9 @@
 #define HAVE_SSE4_1 0
 #define HAVE_AVX 0
 #define HAVE_AVX2 0
+#define HAVE_AVX512 0
 #define HAVE_VSX 0
+#define HAVE_MMI 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
@@ -83,10 +85,11 @@
 #define CONFIG_TEMPORAL_DENOISING 1
 #define CONFIG_VP9_TEMPORAL_DENOISING 0
 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0
-#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_VP9_HIGHBITDEPTH 1
 #define CONFIG_BETTER_HW_COMPATIBILITY 0
 #define CONFIG_EXPERIMENTAL 0
 #define CONFIG_SIZE_LIMIT 1
+#define CONFIG_ALWAYS_ADJUST_BPM 0
 #define CONFIG_SPATIAL_SVC 0
 #define CONFIG_FP_MB_STATS 0
 #define CONFIG_EMULATE_HARDWARE 0
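The configure string baked into vpx_config.c is queryable at runtime through vpx_codec_build_config() (shown just above), which is a convenient way to confirm that a deployed libvpx was really built with --enable-vp9-highbitdepth. A small usage sketch:

    #include <stdio.h>
    #include <string.h>
    #include "vpx/vpx_codec.h"

    int main(void) {
      /* Returns the 'cfg' string compiled into vpx_config.c. */
      const char *cfg = vpx_codec_build_config();
      printf("libvpx build config: %s\n", cfg);
      if (strstr(cfg, "--enable-vp9-highbitdepth") != NULL)
        printf("high bit depth support is compiled in\n");
      return 0;
    }
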
diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h
index adc43df7b..69f2b43b5 100644
--- a/config/x86/vpx_dsp_rtcd.h
+++ b/config/x86/vpx_dsp_rtcd.h
@@ -1,3 +1,4 @@
+// This file is generated. Do not edit.
 #ifndef VPX_DSP_RTCD_H_
 #define VPX_DSP_RTCD_H_
 
@@ -13,6 +14,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
@@ -31,42 +33,42 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int
 void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
 #define vpx_comp_avg_pred vpx_comp_avg_pred_sse2
 
-void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8 vpx_convolve8_ssse3
 
-void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg vpx_convolve8_avg_ssse3
 
-void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_ssse3
 
-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_ssse3
 
-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_horiz vpx_convolve8_horiz_ssse3
 
-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve8_vert vpx_convolve8_vert_ssse3
 
-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_avg vpx_convolve_avg_sse2
 
-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 #define vpx_convolve_copy vpx_convolve_copy_sse2
 
 void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -294,17 +296,1068 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
 void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2
 
-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2
 
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
 #define vpx_hadamard_8x8 vpx_hadamard_8x8_sse2
 
 void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
 #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
 
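The additions that follow are the bit-depth-specific variance/MSE kernels that become reachable once CONFIG_VP9_HIGHBITDEPTH is 1: the 10/12 infix is the operating bit depth, and the uint8_t pointers are aliases of 16-bit sample buffers in the style of libvpx's CONVERT_TO_BYTEPTR convention. A hypothetical dispatch helper, purely illustrative (vpx_highbd_8_variance8x8 is declared elsewhere in this header):

    /* Illustrative only: pick the variance kernel matching the bit depth.
     * src/ref point at uint16_t samples wrapped as uint8_t*, per the
     * high-bit-depth calling convention. */
    static unsigned int highbd_variance8x8(int bd, const uint8_t *src,
                                           int src_stride, const uint8_t *ref,
                                           int ref_stride, unsigned int *sse) {
      switch (bd) {
        case 12:
          return vpx_highbd_12_variance8x8(src, src_stride, ref, ref_stride, sse);
        case 10:
          return vpx_highbd_10_variance8x8(src, src_stride, ref, ref_stride, sse);
        default:
          return vpx_highbd_8_variance8x8(src, src_stride, ref, ref_stride, sse);
      }
    }
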
+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_sse2
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_sse2
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_sse2
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_sse2
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_sse2
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_sse2
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_sse2
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_sse2
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_sse2
+
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_sse2
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_sse2
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_sse2
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int
ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t 
vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int 
yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_sse2 + +uint32_t 
vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_sse2 + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_sse2 + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_sse2 + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_sse2 + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_sse2 + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_sse2 + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_sse2 + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_sse2 + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_sse2 + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_sse2 + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_sse2 + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_sse2 + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int 
xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_sse2 + 
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 
vpx_highbd_8_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_sse2 + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_sse2 + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_sse2 + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_sse2 + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_sse2 + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_sse2 + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_sse2 + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_sse2 + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_sse2 + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_sse2 + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_sse2 + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_c + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_c + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_c + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_c + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_c + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_c + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void 
vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2 + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2 + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_ssse3 + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_ssse3 + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_sse2 + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_ssse3 + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_ssse3 + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_ssse3 + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_sse2 + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_ssse3 + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t 
y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_ssse3 + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_ssse3 + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_sse2 + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_ssse3 + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_ssse3 + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_ssse3 + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_sse2 + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_ssse3 + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_ssse3 + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_ssse3 + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_ssse3 + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_ssse3 + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_ssse3 + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_ssse3 + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_sse2 + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_ssse3 + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_sse2 + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_sse2 + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_sse2 + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_sse2 + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_sse2 + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t 
*dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_sse2 + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_sse2 + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_sse2 + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_sse2 + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_sse2 + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_sse2 + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_sse2 + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_sse2 + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_sse2 + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_sse2 + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, 
const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_sse2 + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_sse2 + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_sse2 + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_sse2 + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_sse2 + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_sse2 + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_sse2 + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_sse2 + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_sse2 + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_sse2 + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_sse2 + +void 
vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_sse2 + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_sse2 + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_sse2 + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_sse2 + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_sse2 + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_sse2 + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define 
vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_sse2 + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_sse2 + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_sse2 + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_sse2 + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_sse2 + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_sse2 + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_sse2 + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_sse2 + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_sse2 + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const 
uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_sse2 + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_sse2 + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_sse2 + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2 + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2 + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_sse2 + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2 + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x16x4d_sse2(const uint8_t 
*src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2 + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_sse2 + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2 + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2 + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_sse2 + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2 + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2 + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_sse2 + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2 + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2 + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_sse2 + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2 + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2 + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_sse2 + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2 + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2 + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2 + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2 + +unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_sse2 + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2 + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2 + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_sse2 + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2 + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2 + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_sse2 + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2 + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2 + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_sse2 + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2 + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2 + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_sse2 + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2 + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2 + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_sse2 + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_sse2 + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_sse2 + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_sse2 + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_sse2 + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_sse2 + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_sse2 + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, 
const uint16_t *left, int bd); +void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 @@ -318,16 +1371,15 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stri #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_38_add vpx_idct16x16_256_add_sse2 +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_ssse3 +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_sse2 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_ssse3 @@ -359,8 +1411,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_64_add vpx_idct8x8_64_add_ssse3 +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -463,10 +1514,12 @@ void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b vpx_quantize_b_sse2 +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_quantize_b vpx_quantize_b_ssse3 void 
vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -540,16 +1593,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2 -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2 -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_sse2 @@ -593,9 +1640,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_sse2 @@ -616,16 +1660,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2 -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad64x64x4d_sse2(const 
uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2 -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_sse2 @@ -657,9 +1695,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_sse2 @@ -679,27 +1714,27 @@ void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c -int vpx_satd_c(const int16_t *coeff, int length); -int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_c(const tran_low_t *coeff, int length); +int vpx_satd_sse2(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_sse2 -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_ssse3 -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/x86/vpx_scale_rtcd.h b/config/x86/vpx_scale_rtcd.h index ddf7d01cc..5f09104ea 100644 --- a/config/x86/vpx_scale_rtcd.h +++ b/config/x86/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/x86/vpx_version.h +++ b/config/x86/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/config/x86_64/vp8_rtcd.h b/config/x86_64/vp8_rtcd.h index 77479a23b..3afbea668 100644 --- a/config/x86_64/vp8_rtcd.h +++ b/config/x86_64/vp8_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. #ifndef VP8_RTCD_H_ #define VP8_RTCD_H_ diff --git a/config/x86_64/vp9_rtcd.h b/config/x86_64/vp9_rtcd.h index f77b2a5c2..2a13e5d5c 100644 --- a/config/x86_64/vp9_rtcd.h +++ b/config/x86_64/vp9_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
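Editorial note: the vpx_dsp_rtcd.h hunks above follow libvpx's RTCD (run-time CPU detection) header convention: each generic entry point is declared once per enabled ISA level, then a #define pins the generic name to the best variant this build enables. Because this configuration passes --disable-runtime-cpu-detect, the binding is purely compile-time, which is why the fix above for vpx_idct16x16_38_add (previously mis-aliased to vpx_idct16x16_256_add_sse2, now bound to its own vpx_idct16x16_38_add_sse2 kernel) is a one-line header rebind with no dispatch-table change. A minimal compilable sketch of the pattern follows; the stub body and main() are illustrative only, not libvpx code.

#include <stdint.h>
#include <stdio.h>

/* tran_low_t is int32_t here because these configs set
 * CONFIG_VP9_HIGHBITDEPTH to 1 (see vpx_dsp/vpx_dsp_common.h). */
typedef int32_t tran_low_t;

/* One declaration per ISA level, exactly as in the generated header... */
void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride);
void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride);
/* ...then the generic name is pinned to the best enabled variant. */
#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2

/* Stub standing in for the real SSE2 kernel so the sketch links. */
void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest,
                               int stride) {
  (void)input; (void)dest; (void)stride;
  puts("dispatched to the sse2 variant");
}

int main(void) {
  tran_low_t coeffs[16 * 16] = { 0 };
  uint8_t recon[16 * 16] = { 0 };
  /* The generic name resolves to the _sse2 symbol at compile time;
   * no function-pointer indirection in a fixed-ISA build. */
  vpx_idct16x16_38_add(coeffs, recon, 16);
  return 0;
}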
#ifndef VP9_RTCD_H_ #define VP9_RTCD_H_ @@ -34,15 +35,14 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, in int64_t vp9_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); #define vp9_block_error vp9_block_error_sse2 -int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); -int64_t vp9_block_error_fp_sse2(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_sse2 int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_fdct8x8_quant vp9_fdct8x8_quant_ssse3 @@ -58,14 +58,44 @@ void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_t void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_fht8x8 vp9_fht8x8_sse2 -int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -int vp9_full_search_sadx3(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); -#define vp9_full_search_sad vp9_full_search_sadx3 - void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_fwht4x4_sse2(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_sse2 +int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_sse2 + +void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c + +void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c + +void 
vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c + +void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c + +void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd); +#define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c + +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c + +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); +#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c + +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c + +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c + +void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); +#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c + void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 @@ -91,9 +121,6 @@ void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct y void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); #define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_ssse3 -void vp9_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count); -#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c - void vp9_rtcd(void); #ifdef RTCD_C diff --git a/config/x86_64/vpx_config.asm b/config/x86_64/vpx_config.asm index 798a81a1c..747981e62 100644 --- a/config/x86_64/vpx_config.asm +++ b/config/x86_64/vpx_config.asm @@ -17,7 +17,9 @@ %define HAVE_SSE4_1 0 %define HAVE_AVX 0 %define HAVE_AVX2 0 +%define HAVE_AVX512 0 %define HAVE_VSX 0 +%define HAVE_MMI 0 %define HAVE_VPX_PORTS 1 %define HAVE_PTHREAD_H 1 %define HAVE_UNISTD_H 1 @@ -71,10 +73,11 @@ %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -%define CONFIG_VP9_HIGHBITDEPTH 0 +%define CONFIG_VP9_HIGHBITDEPTH 1 %define CONFIG_BETTER_HW_COMPATIBILITY 0 %define CONFIG_EXPERIMENTAL 0 %define CONFIG_SIZE_LIMIT 1 +%define CONFIG_ALWAYS_ADJUST_BPM 0 %define CONFIG_SPATIAL_SVC 0 %define CONFIG_FP_MB_STATS 0 %define CONFIG_EMULATE_HARDWARE 0 diff --git 
a/config/x86_64/vpx_config.c b/config/x86_64/vpx_config.c index 9aa0640aa..a13a1d2e2 100644 --- a/config/x86_64/vpx_config.c +++ b/config/x86_64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=x86_64-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072"; +static const char* const cfg = "--target=x86_64-linux-gcc --disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/x86_64/vpx_config.h b/config/x86_64/vpx_config.h index d24e047ba..75d7e9900 100644 --- a/config/x86_64/vpx_config.h +++ b/config/x86_64/vpx_config.h @@ -29,7 +29,9 @@ #define HAVE_SSE4_1 0 #define HAVE_AVX 0 #define HAVE_AVX2 0 +#define HAVE_AVX512 0 #define HAVE_VSX 0 +#define HAVE_MMI 0 #define HAVE_VPX_PORTS 1 #define HAVE_PTHREAD_H 1 #define HAVE_UNISTD_H 1 @@ -83,10 +85,11 @@ #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 -#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 #define CONFIG_EXPERIMENTAL 0 #define CONFIG_SIZE_LIMIT 1 +#define CONFIG_ALWAYS_ADJUST_BPM 0 #define CONFIG_SPATIAL_SVC 0 #define CONFIG_FP_MB_STATS 0 #define CONFIG_EMULATE_HARDWARE 0 diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h index 4e55439ca..a382a5a05 100644 --- a/config/x86_64/vpx_dsp_rtcd.h +++ b/config/x86_64/vpx_dsp_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
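Editorial note: the vpx_config changes above (CONFIG_VP9_HIGHBITDEPTH flipped from 0 to 1, plus the new --enable-vp9-highbitdepth configure flag) are what pull the large vpx_highbd_* and vpx_highbd_10_* blocks into the generated x86_64 headers. Separately, both the vpx_scaled_* hunks earlier and the vpx_convolve8 hunks that follow migrate the convolution API from two raw int16_t filter-phase pointers to the whole InterpKernel table from vpx_dsp/vpx_filter.h (newly included by vpx_dsp_rtcd.h), with explicit starting phases x0_q4/y0_q4. A sketch of the new calling convention, under the assumption that InterpKernel is an 8-tap int16_t kernel type as declared in vpx_dsp/vpx_filter.h; the stub body and kernel values are illustrative, not libvpx's real filter coefficients.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS]; /* as in vpx_dsp/vpx_filter.h */

/* Stub with the v1.7.0 signature from the '+' lines of the diff; the real
 * function does 8-tap subpixel filtering, this stub only demonstrates the
 * parameter plumbing and degenerates to a row copy. */
static void convolve8_stub(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *filter, int x0_q4,
                           int x_step_q4, int y0_q4, int y_step_q4,
                           int w, int h) {
  (void)filter; (void)x0_q4; (void)x_step_q4; (void)y0_q4; (void)y_step_q4;
  for (int r = 0; r < h; ++r)
    memcpy(dst + r * dst_stride, src + r * src_stride, (size_t)w);
}

int main(void) {
  /* A 16-phase kernel table; only phase 0 initialized, rest zeroed. */
  static const InterpKernel filters[16] = { { 0, 0, 0, 128, 0, 0, 0, 0 } };
  uint8_t src[64 * 64] = { 0 }, dst[64 * 64];
  /* The old API passed filter_x/filter_y pointers and two steps; the new
   * API passes the kernel table once plus starting phases. Unit step in
   * Q4 fixed point is 16, so (x0_q4=0, x_step_q4=16) means "no scaling". */
  convolve8_stub(src, 64, dst, 64, filters, 0, 16, 0, 16, 32, 32);
  return 0;
}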
#ifndef VPX_DSP_RTCD_H_ #define VPX_DSP_RTCD_H_ @@ -13,6 +14,7 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" #ifdef __cplusplus @@ -31,42 +33,42 @@ void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); #define vpx_comp_avg_pred vpx_comp_avg_pred_sse2 -void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8 vpx_convolve8_ssse3 -void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_convolve8_avg vpx_convolve8_avg_ssse3 -void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, 
int w, int h);
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_ssse3

-void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_ssse3

-void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_horiz vpx_convolve8_horiz_ssse3

-void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve8_vert vpx_convolve8_vert_ssse3

-void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_avg vpx_convolve_avg_sse2

-void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
-void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
+void vpx_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
#define vpx_convolve_copy vpx_convolve_copy_sse2

void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
@@ -257,8 +259,7 @@ void vpx_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride);
-void vpx_fdct8x8_ssse3(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_fdct8x8 vpx_fdct8x8_ssse3
+#define vpx_fdct8x8 vpx_fdct8x8_sse2

void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_sse2(const int16_t *input, tran_low_t *output, int stride);
@@ -295,18 +296,1075 @@ void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *abov
void vpx_h_predictor_8x8_sse2(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_sse2

-void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_16x16_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_16x16 vpx_hadamard_16x16_sse2

-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_sse2(const int16_t *src_diff, int src_stride, int16_t *coeff);
-void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+void vpx_hadamard_8x8_ssse3(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
#define vpx_hadamard_8x8 vpx_hadamard_8x8_ssse3

void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c

+void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+
+void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+
+unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_sse2
+
+unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+
+unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+
+unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_10_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_sse2
+
+unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_sse2
+
+unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_sse2
+
+unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_sse2
+
+unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_sse2
+
+unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_sse2
+
+unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_sse2
+
+unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+
+unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+
+unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_sse2
+
+unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_sse2
+
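The pattern throughout this generated section is uniform: each block size gets a plain-C reference kernel plus one or more SIMD variants, and a #define binds the fastest variant this x86 configuration enables, so callers always invoke the alias. As a minimal sketch of what the variance entries above compute (an illustrative model, not code from the header; the vpx_highbd_* variants run the same arithmetic on 16-bit samples behind libvpx's uint8_t-pointer convention for high bit depth):

static unsigned int variance_model(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
  int64_t sum = 0;  /* sum of pixel differences */
  uint64_t sq = 0;  /* sum of squared differences */
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = src[j] - ref[j];
      sum += d;
      sq += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sq;  /* SSE is also returned through the out-param */
  /* variance = SSE - sum^2 / N; the subtrahend can never exceed SSE */
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}

The *_sse2 prototypes are drop-in replacements for exactly this computation, which is why switching implementations only ever touches the #define alias.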
+unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_sse2
+
+unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+
+unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_highbd_10_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_sse2
+
+void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+
+void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+
+unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_sse2
+
+unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+
+unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+
+unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_sse2
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t
*src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define 
vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_12_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_sse2 + +unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int source_stride, 
const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_sse2 + +unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_sse2 + +unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_sse2 + +unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_sse2 + +unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_sse2 + +unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_sse2 + +unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c + +unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c + +unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_sse2 + +unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_sse2 + +unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int 
ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_sse2 + +unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c + +unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_12_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_sse2 + +void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c + +void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c + +unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_sse2 + +unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c + +unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c + +unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int 
ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t 
vpx_highbd_8_sub_pixel_avg_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_sse2 + +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_sse2 + +uint32_t 
vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c + +uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c + +uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x4_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_sse2 + +uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_highbd_8_sub_pixel_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_sse2 + +unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_sse2 + +unsigned int vpx_highbd_8_variance16x32_c(const uint8_t 
*src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_sse2 + +unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance16x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_sse2 + +unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_sse2 + +unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_sse2 + +unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance32x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_sse2 + +unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c + +unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c + +unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance64x32_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_sse2 + +unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance64x64_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_sse2 + +unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance8x16_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_sse2 + +unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c + +unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int 
source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_highbd_8_variance8x8_sse2(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_sse2 + +unsigned int vpx_highbd_avg_4x4_c(const uint8_t *, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c + +unsigned int vpx_highbd_avg_8x8_c(const uint8_t *, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c + +void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride); +#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c + +void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8 vpx_highbd_convolve8_sse2 + +void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg vpx_highbd_convolve8_avg_sse2 + +void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_horiz vpx_highbd_convolve8_avg_horiz_sse2 + +void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_avg_vert_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_avg_vert vpx_highbd_convolve8_avg_vert_sse2 + +void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_horiz_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_horiz vpx_highbd_convolve8_horiz_sse2 + +void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve8_vert_sse2(const uint16_t *src, ptrdiff_t 
src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve8_vert vpx_highbd_convolve8_vert_sse2 + +void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_avg_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_avg vpx_highbd_convolve_avg_sse2 + +void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +void vpx_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps); +#define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_sse2 + +void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_ssse3 + +void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_ssse3 + +void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_sse2 + +void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_ssse3 + +void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_16x16 vpx_highbd_d135_predictor_16x16_ssse3 + +void vpx_highbd_d135_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_32x32 vpx_highbd_d135_predictor_32x32_ssse3 + +void vpx_highbd_d135_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_d135_predictor_4x4 vpx_highbd_d135_predictor_4x4_sse2 + +void vpx_highbd_d135_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_ssse3 + +void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_ssse3 + +void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_ssse3 + +void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_sse2 + +void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_ssse3 + +void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_ssse3 + +void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_ssse3 + +void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_sse2 + +void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_ssse3 + +void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_16x16 vpx_highbd_d45_predictor_16x16_ssse3 + +void vpx_highbd_d45_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd); +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_32x32 vpx_highbd_d45_predictor_32x32_ssse3 + +void vpx_highbd_d45_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_4x4 vpx_highbd_d45_predictor_4x4_ssse3 + +void vpx_highbd_d45_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_ssse3 + +void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_ssse3 + +void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_ssse3 + +void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_sse2 + +void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_ssse3 + +void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_16x16 vpx_highbd_dc_128_predictor_16x16_sse2 + +void vpx_highbd_dc_128_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_32x32 vpx_highbd_dc_128_predictor_32x32_sse2 + +void vpx_highbd_dc_128_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_128_predictor_4x4 vpx_highbd_dc_128_predictor_4x4_sse2 + +void vpx_highbd_dc_128_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_dc_128_predictor_8x8 vpx_highbd_dc_128_predictor_8x8_sse2 + +void vpx_highbd_dc_left_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_16x16 vpx_highbd_dc_left_predictor_16x16_sse2 + +void vpx_highbd_dc_left_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_32x32 vpx_highbd_dc_left_predictor_32x32_sse2 + +void vpx_highbd_dc_left_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_4x4 vpx_highbd_dc_left_predictor_4x4_sse2 + +void vpx_highbd_dc_left_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_left_predictor_8x8 vpx_highbd_dc_left_predictor_8x8_sse2 + +void vpx_highbd_dc_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_16x16 vpx_highbd_dc_predictor_16x16_sse2 + +void vpx_highbd_dc_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_32x32 vpx_highbd_dc_predictor_32x32_sse2 + +void vpx_highbd_dc_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_4x4 vpx_highbd_dc_predictor_4x4_sse2 + +void vpx_highbd_dc_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_predictor_8x8 vpx_highbd_dc_predictor_8x8_sse2 + +void vpx_highbd_dc_top_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_16x16 vpx_highbd_dc_top_predictor_16x16_sse2 + +void vpx_highbd_dc_top_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_32x32 vpx_highbd_dc_top_predictor_32x32_sse2 + +void vpx_highbd_dc_top_predictor_4x4_c(uint16_t *dst, ptrdiff_t 
y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_4x4 vpx_highbd_dc_top_predictor_4x4_sse2 + +void vpx_highbd_dc_top_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_sse2 + +void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct16x16_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_sse2 + +void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c + +void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct32x32_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_sse2 + +void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c + +void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct32x32_rd_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_sse2 + +void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct4x4_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_sse2 + +void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_highbd_fdct8x8_sse2(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_sse2 + +void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_highbd_fdct8x8_1 vpx_highbd_fdct8x8_1_c + +void vpx_highbd_h_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_16x16 vpx_highbd_h_predictor_16x16_sse2 + +void vpx_highbd_h_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_32x32 vpx_highbd_h_predictor_32x32_sse2 + +void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_4x4 vpx_highbd_h_predictor_4x4_sse2 + +void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_sse2 + +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, 
uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 + +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_sse2 + +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 + +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct16x16_38_add vpx_highbd_idct16x16_38_add_sse2 + +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_sse2 + +void vpx_highbd_idct32x32_135_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_135_add vpx_highbd_idct32x32_135_add_sse2 + +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 + +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_sse2 + +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 + +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_sse2 + +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 + +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_sse2 + +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 + +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_16_add 
vpx_highbd_iwht4x4_16_add_c + +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); +#define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c + +void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16 vpx_highbd_lpf_horizontal_16_sse2 + +void vpx_highbd_lpf_horizontal_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_16_dual vpx_highbd_lpf_horizontal_16_dual_sse2 + +void vpx_highbd_lpf_horizontal_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_4 vpx_highbd_lpf_horizontal_4_sse2 + +void vpx_highbd_lpf_horizontal_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_4_dual vpx_highbd_lpf_horizontal_4_dual_sse2 + +void vpx_highbd_lpf_horizontal_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_horizontal_8 vpx_highbd_lpf_horizontal_8_sse2 + +void vpx_highbd_lpf_horizontal_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_horizontal_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_horizontal_8_dual vpx_highbd_lpf_horizontal_8_dual_sse2 + +void vpx_highbd_lpf_vertical_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16 vpx_highbd_lpf_vertical_16_sse2 + +void vpx_highbd_lpf_vertical_16_dual_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_16_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_16_dual vpx_highbd_lpf_vertical_16_dual_sse2 + +void vpx_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_4_sse2(uint16_t *s, int pitch, 
const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_4 vpx_highbd_lpf_vertical_4_sse2 + +void vpx_highbd_lpf_vertical_4_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_4_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_4_dual vpx_highbd_lpf_vertical_4_dual_sse2 + +void vpx_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +void vpx_highbd_lpf_vertical_8_sse2(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); +#define vpx_highbd_lpf_vertical_8 vpx_highbd_lpf_vertical_8_sse2 + +void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd); +#define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_sse2 + +void vpx_highbd_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c + +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2 + +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2 + +unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_sse2 + +unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2 + +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2 + +unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_sse2 + +unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2 + +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2 + +unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_sse2 + +unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2 + +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2 + +unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_sse2 + +unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2 + +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int 
ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2 + +unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_sse2 + +unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2 + +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2 + +unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_sse2 + +unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2 + +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2 + +unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c + +unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c + +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2 + +unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c + +unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c + +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2 + +unsigned int 
vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_sse2 + +unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2 + +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2 + +unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_sse2 + +unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2 + +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2 + +unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_sse2 + +unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2 + +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2 + +unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_sse2 + +unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const 
uint8_t *second_pred); +#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2 + +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2 + +unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_sse2 + +unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2 + +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2 + +void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd); +#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c + +void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_16x16 vpx_highbd_tm_predictor_16x16_sse2 + +void vpx_highbd_tm_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_32x32 vpx_highbd_tm_predictor_32x32_sse2 + +void vpx_highbd_tm_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_4x4 vpx_highbd_tm_predictor_4x4_sse2 + +void vpx_highbd_tm_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_tm_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_tm_predictor_8x8 vpx_highbd_tm_predictor_8x8_sse2 + +void vpx_highbd_v_predictor_16x16_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_16x16 vpx_highbd_v_predictor_16x16_sse2 + +void vpx_highbd_v_predictor_32x32_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t 
*left, int bd); +#define vpx_highbd_v_predictor_32x32 vpx_highbd_v_predictor_32x32_sse2 + +void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_4x4 vpx_highbd_v_predictor_4x4_sse2 + +void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 + void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 @@ -320,16 +1378,15 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stri #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct16x16_38_add vpx_idct16x16_256_add_sse2 +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct16x16_38_add vpx_idct16x16_38_add_sse2 void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_ssse3 +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_sse2 void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_ssse3 @@ -361,8 +1418,7 @@ void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); -#define vpx_idct8x8_64_add vpx_idct8x8_64_add_ssse3 +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_sse2 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -544,16 +1600,10 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2 -void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x3 vpx_sad32x32x3_c - void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, 
uint32_t *sad_array); #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2 -void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad32x32x8 vpx_sad32x32x8_c - unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad32x64 vpx_sad32x64_sse2 @@ -597,9 +1647,6 @@ void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 -void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad4x8x8 vpx_sad4x8x8_c - unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad64x32 vpx_sad64x32_sse2 @@ -620,16 +1667,10 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2 -void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x3 vpx_sad64x64x3_c - void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2 -void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad64x64x8 vpx_sad64x64x8_c - unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x16 vpx_sad8x16_sse2 @@ -661,9 +1702,6 @@ void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * con void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 -void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); -#define vpx_sad8x4x8 vpx_sad8x4x8_c - unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); unsigned int vpx_sad8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); #define vpx_sad8x8 vpx_sad8x8_sse2 @@ -683,27 +1721,27 @@ void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); #define vpx_sad8x8x8 vpx_sad8x8x8_c -int vpx_satd_c(const int16_t *coeff, int length); -int vpx_satd_sse2(const int16_t *coeff, int length); +int vpx_satd_c(const tran_low_t *coeff, int length); +int vpx_satd_sse2(const tran_low_t *coeff, 
int length); #define vpx_satd vpx_satd_sse2 -void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_2d vpx_scaled_2d_ssse3 -void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_2d vpx_scaled_avg_2d_c -void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c -void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_avg_vert vpx_scaled_avg_vert_c -void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_horiz vpx_scaled_horiz_c -void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/x86_64/vpx_scale_rtcd.h b/config/x86_64/vpx_scale_rtcd.h index ddf7d01cc..5f09104ea 100644 --- a/config/x86_64/vpx_scale_rtcd.h +++ b/config/x86_64/vpx_scale_rtcd.h @@ -1,3 +1,4 @@ +// This file is generated. Do not edit. 
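The vpx_dsp_rtcd.h listing above is a generated dispatch table: each kernel gets a plain-C prototype, prototypes for whichever SIMD variants the configuration enables, and a #define that binds the generic name to the chosen variant at compile time (these Android configs pass --disable-runtime-cpu-detect, so the binding is static rather than resolved at startup). A minimal sketch of what that means for a caller, using the vpx_satd entry from the table above; the wrapper function is hypothetical, and the include assumes the generated config directory is on the include path:

    /* sketch: calling through the generated RTCD macro */
    #include "vpx_dsp_rtcd.h" /* the generated header shown above */

    /* hypothetical helper: in this x86_64 configuration, vpx_satd is a
       macro that expands to vpx_satd_sse2(coeff, length) */
    static int block_satd(const tran_low_t *coeff, int length) {
      return vpx_satd(coeff, length);
    }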
#ifndef VPX_SCALE_RTCD_H_ #define VPX_SCALE_RTCD_H_ @@ -46,6 +47,9 @@ void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); #define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c +void vpx_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_frame vpx_yv12_copy_frame_c + void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); #define vpx_yv12_copy_y vpx_yv12_copy_y_c diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h index 24da169b4..6078bae90 100644 --- a/config/x86_64/vpx_version.h +++ b/config/x86_64/vpx_version.h @@ -1,7 +1,8 @@ +// This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 6 -#define VERSION_PATCH 1 +#define VERSION_MINOR 7 +#define VERSION_PATCH 0 #define VERSION_EXTRA "" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.1" -#define VERSION_STRING " v1.6.1" +#define VERSION_STRING_NOSP "v1.7.0" +#define VERSION_STRING " v1.7.0" diff --git a/generate_config.sh b/generate_config.sh index 97e5a0026..351feb87c 100755 --- a/generate_config.sh +++ b/generate_config.sh @@ -139,23 +139,76 @@ function gen_config_files { rm -rf vpx_config.* vpx_version.h } +# Generate a text file containing sources for a config +# $1 - Config +function gen_source_list { + make_clean + if [[ "$1" = "mips"* ]] || [[ "$1" = "generic" ]]; then + config=$(print_config_basic $1) + else + config=$(print_config $1) + fi + make libvpx_srcs.txt target=libs $config > /dev/null + mv libvpx_srcs.txt libvpx_srcs_$1.txt +} + +# Extract a list of C sources from a libvpx_srcs.txt file +# $1 - path to libvpx_srcs.txt +function libvpx_srcs_txt_to_c_srcs { + grep ".c$" $1 | grep -v "^vpx_config.c$" | awk '$0="\"libvpx/"$0"\","' | sort +} + +# Extract a list of ASM sources from a libvpx_srcs.txt file +# $1 - path to libvpx_srcs.txt +function libvpx_srcs_txt_to_asm_srcs { + grep ".asm$" $1 | awk '$0="\"libvpx/"$0"\","' | sort +} + # Convert a list of sources to a blueprint file containing a variable # assignment. -# $1 - Variable name prefix. -# $2 - Input file. -# $3 - Config directory. +# $1 - Config function gen_bp_srcs { - echo "$1_c_srcs = [" - grep ".c$" $2 | grep -v "^vpx_config.c$" | awk '$0="\"libvpx/"$0"\","' - echo "\"$3/vpx_config.c\"," - echo "]" - if grep -q ".asm$" $2; then + ( + varprefix=libvpx_${1//-/_} + echo "${varprefix}_c_srcs = [" + libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt + echo "\"$LIBVPX_CONFIG_DIR/$1/vpx_config.c\"," + echo "]" + if grep -q ".asm$" libvpx_srcs_$1.txt; then + echo + echo "${varprefix}_asm_srcs = [" + libvpx_srcs_txt_to_asm_srcs libvpx_srcs_$1.txt + echo "]" + fi echo - echo "$1_asm_srcs = [" - grep ".asm$" $2 | awk '$0="\"libvpx/"$0"\","' + ) > config_$1.bp +} + +# Convert a list of sources to a blueprint file containing a variable +# assignment, relative to a reference config. 
+# $1 - Config +# $2 - Reference config +function gen_bp_srcs_with_excludes { + ( + varprefix=libvpx_${1//-/_} + echo "${varprefix}_c_srcs = [" + comm -23 <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt) <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$2.txt) + echo "\"$LIBVPX_CONFIG_DIR/$1/vpx_config.c\"," echo "]" - fi - echo + echo + echo "${varprefix}_exclude_c_srcs = [" + comm -13 <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$1.txt) <(libvpx_srcs_txt_to_c_srcs libvpx_srcs_$2.txt) + echo "\"$LIBVPX_CONFIG_DIR/$2/vpx_config.c\"," + echo "]" + echo + if grep -q ".asm$" libvpx_srcs_$1.txt; then + echo + echo "${varprefix}_asm_srcs = [" + libvpx_srcs_txt_to_asm_srcs libvpx_srcs_$1.txt + echo "]" + fi + echo + ) > config_$1.bp } echo "Create temporary directory." @@ -165,8 +218,10 @@ cp -R $LIBVPX_SRC_DIR $TEMP_DIR cd $TEMP_DIR echo "Generate config files." -all_platforms="--enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072" -intel="--disable-sse4_1 --disable-avx --disable-avx2 --as=yasm" +all_platforms="--enable-external-build --enable-realtime-only --enable-pic" +all_platforms+=" --disable-runtime-cpu-detect --disable-install-docs" +all_platforms+=" --size-limit=4096x3072 --enable-vp9-highbitdepth" +intel="--disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm" gen_config_files x86 "--target=x86-linux-gcc ${intel} ${all_platforms}" gen_config_files x86_64 "--target=x86_64-linux-gcc ${intel} ${all_platforms}" gen_config_files arm "--target=armv7-linux-gcc --disable-neon ${all_platforms}" @@ -218,71 +273,31 @@ echo "Prepare Makefile." ./configure --target=generic-gnu > /dev/null make_clean -echo "Generate X86 source list." -config=$(print_config x86) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_x86 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/x86 > config_x86.bp - -echo "Generate X86_64 source list." -config=$(print_config x86_64) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_x86_64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/x86_64 > config_x86_64.bp - -echo "Generate ARM source list." -config=$(print_config arm) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_arm libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm > config_arm.bp - -echo "Generate ARM NEON source list." -config=$(print_config arm-neon) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_arm_neon libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm-neon > config_arm-neon.bp - -echo "Generate ARM64 source list." -config=$(print_config arm64) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_arm64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/arm64 > config_arm64.bp - -echo "Generate MIPS source list." -config=$(print_config_basic mips32) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips32 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32 > config_mips32.bp - -echo "Generate MIPS DSPR2 source list." -config=$(print_config_basic mips32-dspr2) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips32_dspr2 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32-dspr2 > config_mips32-dispr2.bp - -echo "Generate MIPS MSA source list." 
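The gen_bp_srcs_with_excludes function added above builds the per-variant lists with comm(1) over two sorted source lists: comm -23 keeps entries unique to the first file (sources only the variant builds), while comm -13 keeps entries unique to the second (reference sources the variant must exclude). A standalone sketch of the same split, with hypothetical file names; comm needs sorted input, which the helper functions above guarantee by piping through sort:

    # sketch: the comm(1) split used by gen_bp_srcs_with_excludes
    sort -o variant_srcs.txt variant_srcs.txt
    sort -o reference_srcs.txt reference_srcs.txt
    comm -23 variant_srcs.txt reference_srcs.txt   # only in variant   -> extra sources
    comm -13 variant_srcs.txt reference_srcs.txt   # only in reference -> excluded sources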
-config=$(print_config_basic mips32-msa) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips32_msa libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips32-msa > config_mips32-msa.bp - -echo "Generate MIPS64 source list." -config=$(print_config_basic mips64) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips64 libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips64 > config_mips64.bp - -echo "Generate MIPS64 MSA source list." -config=$(print_config_basic mips64-msa) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_mips64_msa libvpx_srcs.txt $LIBVPX_CONFIG_DIR/mips64-msa > config_mips64-msa.bp - -echo "Generate GENERIC source list." -config=$(print_config_basic generic) -make_clean -make libvpx_srcs.txt target=libs $config > /dev/null -gen_bp_srcs libvpx_generic libvpx_srcs.txt $LIBVPX_CONFIG_DIR/generic > config_generic.bp +echo "Generate source lists" +gen_source_list x86 +gen_source_list x86_64 +gen_source_list arm +gen_source_list arm-neon +gen_source_list arm64 +gen_source_list mips32 +gen_source_list mips32-dspr2 +gen_source_list mips32-msa +gen_source_list mips64 +gen_source_list mips64-msa +gen_source_list generic + +echo "Convert to bp" +gen_bp_srcs x86 +gen_bp_srcs x86_64 +gen_bp_srcs arm +gen_bp_srcs_with_excludes arm-neon arm +gen_bp_srcs arm64 +gen_bp_srcs mips32 +gen_bp_srcs_with_excludes mips32-dspr2 mips32 +gen_bp_srcs_with_excludes mips32-msa mips32 +gen_bp_srcs mips64 +gen_bp_srcs_with_excludes mips64-msa mips64 +gen_bp_srcs generic rm -f $BASE_DIR/Android.bp ( diff --git a/libvpx/.clang-format b/libvpx/.clang-format index 7837b7704..c1483199e 100644 --- a/libvpx/.clang-format +++ b/libvpx/.clang-format @@ -1,7 +1,7 @@ --- Language: Cpp # BasedOnStyle: Google -# Generated with clang-format 3.9.1 +# Generated with clang-format 4.0.1 AccessModifierOffset: -1 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: false @@ -60,6 +60,8 @@ IncludeIsMainRegex: '([-_](test|unittest))?$' IndentCaseLabels: true IndentWidth: 2 IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' @@ -78,6 +80,7 @@ PointerAlignment: Right ReflowComments: true SortIncludes: false SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false diff --git a/libvpx/.mailmap b/libvpx/.mailmap index 166c45ee8..29af51065 100644 --- a/libvpx/.mailmap +++ b/libvpx/.mailmap @@ -3,6 +3,7 @@ Aℓex Converse <aconverse@google.com> Aℓex Converse <aconverse@google.com> <alex.converse@gmail.com> Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com> Alpha Lam <hclam@google.com> <hclam@chromium.org> +Chris Cunningham <chcunningham@chromium.org> Daniele Castagna <dcastagna@chromium.org> <dcastagna@google.com> Deb Mukherjee <debargha@google.com> Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com> @@ -21,18 +22,21 @@ Marco Paniconi <marpan@google.com> Marco Paniconi <marpan@google.com> <marpan@chromium.org> Pascal Massimino <pascal.massimino@gmail.com> Paul Wilkins <paulwilkins@google.com> +Peter Boström <pbos@chromium.org> <pbos@google.com> Peter de Rivaz <peter.derivaz@gmail.com> Peter de Rivaz <peter.derivaz@gmail.com> <peter.derivaz@argondesign.com> Ralph Giles <giles@xiph.org> <giles@entropywave.com> Ralph Giles <giles@xiph.org> <giles@mozilla.com> Ronald S. 
Bultje <rsbultje@gmail.com> <rbultje@google.com> Sami Pietilä <samipietila@google.com> +Shiyou Yin <yinshiyou-hf@loongson.cn> Tamar Levy <tamar.levy@intel.com> Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com> Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com> Timothy B. Terriberry <tterribe@xiph.org> <tterriberry@mozilla.com> Tom Finegan <tomfinegan@google.com> Tom Finegan <tomfinegan@google.com> <tomfinegan@chromium.org> +Urvang Joshi <urvang@google.com> <urvang@chromium.org> Yaowu Xu <yaowu@google.com> <adam@xuyaowu.com> Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <Yaowu Xu> diff --git a/libvpx/AUTHORS b/libvpx/AUTHORS index 87a5e845c..04c287243 100644 --- a/libvpx/AUTHORS +++ b/libvpx/AUTHORS @@ -3,13 +3,13 @@ Aaron Watry <awatry@gmail.com> Abo Talib Mahfoodh <ab.mahfoodh@gmail.com> -Adam Xu <adam@xuyaowu.com> Adrian Grange <agrange@google.com> Aℓex Converse <aconverse@google.com> Ahmad Sharif <asharif@google.com> Aleksey Vasenev <margtu-fivt@ya.ru> Alexander Potapenko <glider@google.com> Alexander Voronov <avoronov@graphics.cs.msu.ru> +Alexandra Hájková <alexandra.khirnova@gmail.com> Alexis Ballier <aballier@gentoo.org> Alok Ahuja <waveletcoeff@gmail.com> Alpha Lam <hclam@google.com> @@ -17,6 +17,7 @@ A.Mahfoodh <ab.mahfoodh@gmail.com> Ami Fischman <fischman@chromium.org> Andoni Morales Alastruey <ylatuya@gmail.com> Andres Mejia <mcitadel@gmail.com> +Andrew Lewis <andrewlewis@google.com> Andrew Russell <anrussell@google.com> Angie Chiang <angiebird@google.com> Aron Rosenberg <arosenberg@logitech.com> @@ -24,7 +25,9 @@ Attila Nagy <attilanagy@google.com> Brion Vibber <bvibber@wikimedia.org> changjun.yang <changjun.yang@intel.com> Charles 'Buck' Krasic <ckrasic@google.com> +Cheng Chen <chengchen@google.com> chm <chm@rock-chips.com> +Chris Cunningham <chcunningham@chromium.org> Christian Duvivier <cduvivier@google.com> Daniele Castagna <dcastagna@chromium.org> Daniel Kang <ddkang@google.com> @@ -46,10 +49,12 @@ Geza Lore <gezalore@gmail.com> Ghislain MARY <ghislainmary2@gmail.com> Giuseppe Scrivano <gscrivano@gnu.org> Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com> +Gregor Jasny <gjasny@gmail.com> Guillaume Martres <gmartres@google.com> Guillermo Ballester Valor <gbvalor@gmail.com> Hangyu Kuang <hkuang@google.com> Hanno Böck <hanno@hboeck.de> +Han Shen <shenhan@google.com> Henrik Lundin <hlundin@google.com> Hui Su <huisu@google.com> Ivan Krasin <krasin@chromium.org> @@ -83,6 +88,7 @@ Justin Clift <justin@salasaga.org> Justin Lebar <justin.lebar@gmail.com> Kaustubh Raste <kaustubh.raste@imgtec.com> KO Myung-Hun <komh@chollian.net> +Kyle Siefring <kylesiefring@gmail.com> Lawrence Velázquez <larryv@macports.org> Linfeng Zhang <linfengz@google.com> Lou Quillio <louquillio@google.com> @@ -101,6 +107,7 @@ Mikhal Shemer <mikhal@google.com> Min Chen <chenm003@gmail.com> Minghai Shang <minghai@google.com> Min Ye <yeemmi@google.com> +Moriyoshi Koizumi <mozo@mozo.jp> Morton Jonuschat <yabawock@gmail.com> Nathan E. 
Egge <negge@mozilla.com>
 Nico Weber <thakis@chromium.org>
@@ -111,12 +118,15 @@ Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
 Paweł Hajdan <phajdan@google.com>
 Pengchong Jin <pengchong@google.com>
-Peter Boström <pbos@google.com>
+Peter Boström <pbos@chromium.org>
+Peter Collingbourne <pcc@chromium.org>
 Peter de Rivaz <peter.derivaz@gmail.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
+Rafael de Lucena Valle <rafaeldelucena@gmail.com>
+Rahul Chaudhry <rahulchaudhry@google.com>
 Ralph Giles <giles@xiph.org>
 Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
 Rob Bradford <rob@linux.intel.com>
@@ -131,9 +141,11 @@ Sean McGovern <gseanmcg@gmail.com>
 Sergey Kolomenkin <kolomenkin@gmail.com>
 Sergey Ulanov <sergeyu@chromium.org>
 Shimon Doodkin <helpmepro1@gmail.com>
+Shiyou Yin <yinshiyou-hf@loongson.cn>
 Shunyao Li <shunyaoli@google.com>
 Stefan Holmer <holmer@google.com>
 Suman Sunkara <sunkaras@google.com>
+Sylvestre Ledru <sylvestre@mozilla.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
 Tamar Levy <tamar.levy@intel.com>
@@ -146,6 +158,7 @@ Tom Finegan <tomfinegan@google.com>
 Tristan Matthews <le.businessman@gmail.com>
 Urvang Joshi <urvang@google.com>
 Vignesh Venkatasubramanian <vigneshv@google.com>
+Vlad Tsyrklevich <vtsyrklevich@chromium.org>
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
diff --git a/libvpx/CHANGELOG b/libvpx/CHANGELOG
index 7e7aec67a..2281394c8 100644
--- a/libvpx/CHANGELOG
+++ b/libvpx/CHANGELOG
@@ -1,3 +1,28 @@
+2018-01-24 v1.7.0 "Mandarin Duck"
+  This release focused on high bit depth performance (10/12 bit) and vp9
+  encoding improvements.
+
+  - Upgrading:
+    This release is ABI incompatible due to new vp9 encoder features.
+
+    Frame parallel decoding for vp9 has been removed.
+
+  - Enhancements:
+    vp9 encoding supports additional threads with --row-mt. This can be greater
+    than the number of tiles.
+
+    Two new vp9 encoder options have been added:
+      --corpus-complexity
+      --tune-content=film
+
+    Additional tooling for respecting the vp9 "level" profiles has been added.
+
+  - Bug fixes:
+    A variety of fuzzing issues.
+    vp8 threading fix for ARM.
+    Codec control VP9_SET_SKIP_LOOP_FILTER fixed.
+    Reject invalid multi resolution configurations.
+
 2017-01-09 v1.6.1 "Long Tailed Duck"
   This release improves upon the VP9 encoder and speeds up the encoding and
   decoding processes.
diff --git a/libvpx/README b/libvpx/README
index f910ce761..73304dd62 100644
--- a/libvpx/README
+++ b/libvpx/README
@@ -1,4 +1,4 @@
-README - 26 January 2017
+README - 24 January 2018

 Welcome to the WebM VP8/VP9 Codec SDK!
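The --row-mt option noted in the CHANGELOG above is exposed through the VP9E_SET_ROW_MT codec control, which this same patch starts using in vp9_spatial_svc_encoder.c. A minimal sketch, assuming an encoder context already set up with vpx_codec_enc_init():

    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    // Enable row-based multi-threading on a configured vp9 encoder. With
    // row-mt the worker count may usefully exceed the number of tile columns.
    static void enable_row_mt(vpx_codec_ctx_t *codec) {
      vpx_codec_control(codec, VP9E_SET_ROW_MT, 1);
    }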
@@ -63,6 +63,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc + ppc64-linux-gcc + ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc x86-darwin8-gcc diff --git a/libvpx/build/make/Makefile b/libvpx/build/make/Makefile index 90522e5f6..f6b3f0630 100644 --- a/libvpx/build/make/Makefile +++ b/libvpx/build/make/Makefile @@ -139,6 +139,8 @@ $(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +$(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh index fbe8b1b45..683b43037 100755 --- a/libvpx/build/make/configure.sh +++ b/libvpx/build/make/configure.sh @@ -403,6 +403,23 @@ check_gcc_machine_option() { fi } +# tests for -m$2, -m$3, -m$4... toggling the feature given in $1. +check_gcc_machine_options() { + feature="$1" + shift + flags="-m$1" + shift + for opt in $*; do + flags="$flags -m$opt" + done + + if enabled gcc && ! disabled "$feature" && ! check_cflags $flags; then + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-$feature " + else + soft_enable "$feature" + fi +} + write_common_config_banner() { print_webm_license config.mk "##" "" echo '# This file automatically generated by configure. Do not edit!' >> config.mk @@ -702,6 +719,12 @@ process_common_toolchain() { power*) tgt_isa=ppc ;; + *mips64el*) + tgt_isa=mips64 + ;; + *mips32el*) + tgt_isa=mips32 + ;; esac # detect tgt_os @@ -1163,6 +1186,11 @@ EOF fi fi + if enabled mmi; then + tgt_isa=loongson3a + check_add_ldflags -march=loongson3a + fi + check_add_cflags -march=${tgt_isa} check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC @@ -1227,6 +1255,13 @@ EOF msvs_arch_dir=x86-msvs vc_version=${tgt_cc##vs} case $vc_version in + 7|8|9|10|11|12|13|14) + echo "${tgt_cc} does not support avx512, disabling....." + RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx512 " + soft_disable avx512 + ;; + esac + case $vc_version in 7|8|9|10) echo "${tgt_cc} does not support avx/avx2, disabling....." RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 " @@ -1270,9 +1305,18 @@ EOF elif disabled $ext; then disable_exts="yes" else - # use the shortened version for the flag: sse4_1 -> sse4 - check_gcc_machine_option ${ext%_*} $ext + if [ "$ext" = "avx512" ]; then + check_gcc_machine_options $ext avx512f avx512cd avx512bw avx512dq avx512vl + else + # use the shortened version for the flag: sse4_1 -> sse4 + check_gcc_machine_option ${ext%_*} $ext + fi fi + + # https://bugs.chromium.org/p/webm/issues/detail?id=1464 + # The assembly optimizations for vpx_sub_pixel_variance do not link with + # gcc 6. 
+ enabled sse2 && soft_enable pic done if enabled external_build; then @@ -1297,7 +1341,6 @@ EOF esac log_echo " using $AS" fi - [ "${AS##*/}" = nasm ] && add_asflags -Ox AS_SFX=.asm case ${tgt_os} in win32) @@ -1306,7 +1349,7 @@ EOF EXE_SFX=.exe ;; win64) - add_asflags -f x64 + add_asflags -f win64 enabled debug && add_asflags -g cv8 EXE_SFX=.exe ;; @@ -1440,6 +1483,10 @@ EOF echo "msa optimizations are available only for little endian platforms" disable_feature msa fi + if enabled mmi; then + echo "mmi optimizations are available only for little endian platforms" + disable_feature mmi + fi fi ;; esac diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh index 8b68038b3..401223a0b 100755 --- a/libvpx/build/make/gen_msvs_sln.sh +++ b/libvpx/build/make/gen_msvs_sln.sh @@ -240,10 +240,10 @@ case "${vs_ver:-10}" in 12) sln_vers="12.00" sln_vers_str="Visual Studio 2013" ;; - 14) sln_vers="14.00" + 14) sln_vers="12.00" sln_vers_str="Visual Studio 2015" ;; - 15) sln_vers="15.00" + 15) sln_vers="12.00" sln_vers_str="Visual Studio 2017" ;; esac diff --git a/libvpx/build/make/rtcd.pl b/libvpx/build/make/rtcd.pl index ce88e6480..68e92b52c 100755 --- a/libvpx/build/make/rtcd.pl +++ b/libvpx/build/make/rtcd.pl @@ -1,4 +1,13 @@ #!/usr/bin/env perl +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## no strict 'refs'; use warnings; @@ -200,6 +209,7 @@ sub filter { sub common_top() { my $include_guard = uc($opts{sym})."_H_"; print <<EOF; +// This file is generated. Do not edit. #ifndef ${include_guard} #define ${include_guard} @@ -391,10 +401,10 @@ EOF &require("c"); if ($opts{arch} eq 'x86') { - @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/); + @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); x86; } elsif ($opts{arch} eq 'x86_64') { - @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/); + @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2 avx512/); @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/); &require(@REQUIRES); x86; @@ -411,6 +421,10 @@ if ($opts{arch} eq 'x86') { @ALL_ARCHS = filter("$opts{arch}", qw/msa/); last; } + if (/HAVE_MMI=yes/) { + @ALL_ARCHS = filter("$opts{arch}", qw/mmi/); + last; + } } close CONFIG_FILE; mips; diff --git a/libvpx/build/make/version.sh b/libvpx/build/make/version.sh index 696752777..f36ede10f 100755 --- a/libvpx/build/make/version.sh +++ b/libvpx/build/make/version.sh @@ -60,6 +60,7 @@ if [ ${bare} ]; then echo "${changelog_version}${git_version_id}" > $$.tmp else cat<<EOF>$$.tmp +// This file is generated. Do not edit. #define VERSION_MAJOR $major_version #define VERSION_MINOR $minor_version #define VERSION_PATCH $patch_version diff --git a/libvpx/configure b/libvpx/configure index 090d3fb1e..e5a74c6f2 100755 --- a/libvpx/configure +++ b/libvpx/configure @@ -170,11 +170,14 @@ for t in ${all_targets}; do [ -f "${source_path}/${t}.mk" ] && enable_feature ${t} done +if ! diff --version >/dev/null; then + die "diff missing: Try installing diffutils via your package manager." +fi + if ! 
perl --version >/dev/null; then die "Perl is required to build" fi - if [ "`cd \"${source_path}\" && pwd`" != "`pwd`" ]; then # test to see if source_path already configured if [ -f "${source_path}/vpx_config.h" ]; then @@ -241,7 +244,13 @@ ARCH_EXT_LIST_X86=" sse4_1 avx avx2 + avx512 " + +ARCH_EXT_LIST_LOONGSON=" + mmi +" + ARCH_EXT_LIST=" neon neon_asm @@ -254,6 +263,8 @@ ARCH_EXT_LIST=" ${ARCH_EXT_LIST_X86} vsx + + ${ARCH_EXT_LIST_LOONGSON} " HAVE_LIST=" ${ARCH_EXT_LIST} @@ -319,6 +330,7 @@ CONFIG_LIST=" better_hw_compatibility experimental size_limit + always_adjust_bpm ${EXPERIMENT_LIST} " CMDLINE_SELECT=" @@ -378,6 +390,7 @@ CMDLINE_SELECT=" better_hw_compatibility vp9_highbitdepth experimental + always_adjust_bpm " process_cmdline() { @@ -579,6 +592,7 @@ process_toolchain() { check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wfloat-conversion + check_add_cflags -Wparentheses-equality check_add_cflags -Wpointer-arith check_add_cflags -Wtype-limits check_add_cflags -Wcast-qual @@ -651,7 +665,7 @@ process_toolchain() { gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror" all_targets="${all_targets} solution" - INLINE="__forceinline" + INLINE="__inline" ;; esac diff --git a/libvpx/examples/vp8_multi_resolution_encoder.c b/libvpx/examples/vp8_multi_resolution_encoder.c index 0b9663c77..b14b1ff39 100644 --- a/libvpx/examples/vp8_multi_resolution_encoder.c +++ b/libvpx/examples/vp8_multi_resolution_encoder.c @@ -151,7 +151,7 @@ static void write_ivf_frame_header(FILE *outfile, if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; pts = pkt->data.frame.pts; - mem_put_le32(header, pkt->data.frame.sz); + mem_put_le32(header, (int)pkt->data.frame.sz); mem_put_le32(header + 4, pts & 0xFFFFFFFF); mem_put_le32(header + 8, pts >> 32); @@ -190,7 +190,7 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[0] = 0; cfg->ts_layer_id[1] = 1; // Use 60/40 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.6f * bitrate; + cfg->ts_target_bitrate[0] = (int)(0.6f * bitrate); cfg->ts_target_bitrate[1] = bitrate; /* 0=L, 1=GF */ @@ -241,8 +241,8 @@ static void set_temporal_layer_pattern(int num_temporal_layers, cfg->ts_layer_id[2] = 1; cfg->ts_layer_id[3] = 2; // Use 45/20/35 bit allocation as example. - cfg->ts_target_bitrate[0] = 0.45f * bitrate; - cfg->ts_target_bitrate[1] = 0.65f * bitrate; + cfg->ts_target_bitrate[0] = (int)(0.45f * bitrate); + cfg->ts_target_bitrate[1] = (int)(0.65f * bitrate); cfg->ts_target_bitrate[2] = bitrate; /* 0=L, 1=GF, 2=ARF */ @@ -294,8 +294,8 @@ int main(int argc, char **argv) { vpx_codec_err_t res[NUM_ENCODERS]; int i; - long width; - long height; + int width; + int height; int length_frame; int frame_avail; int got_data; @@ -347,9 +347,9 @@ int main(int argc, char **argv) { printf("Using %s\n", vpx_codec_iface_name(interface)); - width = strtol(argv[1], NULL, 0); - height = strtol(argv[2], NULL, 0); - framerate = strtol(argv[3], NULL, 0); + width = (int)strtol(argv[1], NULL, 0); + height = (int)strtol(argv[2], NULL, 0); + framerate = (int)strtol(argv[3], NULL, 0); if (width < 16 || width % 2 || height < 16 || height % 2) die("Invalid resolution: %ldx%ld", width, height); @@ -371,12 +371,13 @@ int main(int argc, char **argv) { // Bitrates per spatial layer: overwrite default rates above. 
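An aside on the write_ivf_frame_header() hunk above: it packs a 12-byte little-endian IVF frame header, a 32-bit frame size followed by the 64-bit pts stored as two 32-bit halves. A self-contained sketch of that layout, with put_le32 standing in for libvpx's mem_put_le32():

    #include <stddef.h>
    #include <stdint.h>

    // Little-endian 32-bit store, mirroring mem_put_le32().
    static void put_le32(uint8_t *mem, uint32_t val) {
      mem[0] = (uint8_t)(val & 0xff);
      mem[1] = (uint8_t)((val >> 8) & 0xff);
      mem[2] = (uint8_t)((val >> 16) & 0xff);
      mem[3] = (uint8_t)((val >> 24) & 0xff);
    }

    // 12-byte IVF frame header: frame size, pts low word, pts high word.
    static void pack_ivf_frame_header(uint8_t header[12], size_t frame_sz,
                                      int64_t pts) {
      put_le32(header, (uint32_t)frame_sz);
      put_le32(header + 4, (uint32_t)(pts & 0xFFFFFFFF));
      put_le32(header + 8, (uint32_t)(pts >> 32));
    }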
for (i = 0; i < NUM_ENCODERS; i++) { - target_bitrate[i] = strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); + target_bitrate[i] = (int)strtol(argv[NUM_ENCODERS + 5 + i], NULL, 0); } // Temporal layers per spatial layers: overwrite default settings above. for (i = 0; i < NUM_ENCODERS; i++) { - num_temporal_layers[i] = strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); + num_temporal_layers[i] = + (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0); if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3) die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n", num_temporal_layers); @@ -391,9 +392,9 @@ int main(int argc, char **argv) { downsampled_input[i] = fopen(filename, "wb"); } - key_frame_insert = strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); + key_frame_insert = (int)strtol(argv[3 * NUM_ENCODERS + 5], NULL, 0); - show_psnr = strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); + show_psnr = (int)strtol(argv[3 * NUM_ENCODERS + 6], NULL, 0); /* Populate default encoder configuration */ for (i = 0; i < NUM_ENCODERS; i++) { @@ -469,7 +470,7 @@ int main(int argc, char **argv) { if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32)) die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h); - if (raw[0].stride[VPX_PLANE_Y] == raw[0].d_w) + if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w) read_frame_p = read_frame; else read_frame_p = read_frame_by_row; @@ -558,7 +559,8 @@ int main(int argc, char **argv) { /* Write out down-sampled input. */ length_frame = cfg[i].g_w * cfg[i].g_h * 3 / 2; if (fwrite(raw[i].planes[0], 1, length_frame, - downsampled_input[NUM_ENCODERS - i - 1]) != length_frame) { + downsampled_input[NUM_ENCODERS - i - 1]) != + (unsigned int)length_frame) { return EXIT_FAILURE; } } @@ -619,10 +621,6 @@ int main(int argc, char **argv) { break; default: break; } - printf(pkt[i]->kind == VPX_CODEC_CX_FRAME_PKT && - (pkt[i]->data.frame.flags & VPX_FRAME_IS_KEY) - ? "K" - : ""); fflush(stdout); } } @@ -663,7 +661,6 @@ int main(int argc, char **argv) { write_ivf_file_header(outfile[i], &cfg[i], frame_cnt - 1); fclose(outfile[i]); } - printf("\n"); return EXIT_SUCCESS; } diff --git a/libvpx/examples/vp9_spatial_svc_encoder.c b/libvpx/examples/vp9_spatial_svc_encoder.c index 1f5078aad..0987cbfb8 100644 --- a/libvpx/examples/vp9_spatial_svc_encoder.c +++ b/libvpx/examples/vp9_spatial_svc_encoder.c @@ -168,7 +168,7 @@ void usage_exit(void) { static void parse_command_line(int argc, const char **argv_, AppInput *app_input, SvcContext *svc_ctx, vpx_codec_enc_cfg_t *enc_cfg) { - struct arg arg = { 0 }; + struct arg arg; char **argv = NULL; char **argi = NULL; char **argj = NULL; @@ -509,7 +509,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc, } vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, - uint32_t sizes[8], int *count) { + uint64_t sizes[8], int *count) { // A chunk ending with a byte matching 0xc0 is an invalid chunk unless // it is a super frame index. If the last byte of real video compression // data is 0xc0 the encoder must add a 0 byte. 
If we have the marker but @@ -606,9 +606,9 @@ void set_frame_flags_bypass_mode(int sl, int tl, int num_spatial_layers, } int main(int argc, const char **argv) { - AppInput app_input = { 0 }; + AppInput app_input; VpxVideoWriter *writer = NULL; - VpxVideoInfo info = { 0 }; + VpxVideoInfo info; vpx_codec_ctx_t codec; vpx_codec_enc_cfg_t enc_cfg; SvcContext svc_ctx; @@ -640,8 +640,9 @@ int main(int argc, const char **argv) { // Allocate image buffer #if CONFIG_VP9_HIGHBITDEPTH - if (!vpx_img_alloc(&raw, enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 - : VPX_IMG_FMT_I42016, + if (!vpx_img_alloc(&raw, + enc_cfg.g_input_bit_depth == 8 ? VPX_IMG_FMT_I420 + : VPX_IMG_FMT_I42016, enc_cfg.g_w, enc_cfg.g_h, 32)) { die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h); } @@ -699,12 +700,16 @@ int main(int argc, const char **argv) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, svc_ctx.speed); if (svc_ctx.threads) { vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1)); - vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); + if (svc_ctx.threads > 1) + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1); + else + vpx_codec_control(&codec, VP9E_SET_ROW_MT, 0); } if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1) vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); if (svc_ctx.speed >= 5) vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, 900); // Encode frames while (!end_of_stream) { @@ -765,7 +770,7 @@ int main(int argc, const char **argv) { SvcInternal_t *const si = (SvcInternal_t *)svc_ctx.internal; if (cx_pkt->data.frame.sz > 0) { #if OUTPUT_RC_STATS - uint32_t sizes[8]; + uint64_t sizes[8]; int count = 0; #endif vpx_video_writer_write_frame(writer, cx_pkt->data.frame.buf, @@ -777,6 +782,8 @@ int main(int argc, const char **argv) { vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); parse_superframe_index(cx_pkt->data.frame.buf, cx_pkt->data.frame.sz, sizes, &count); + if (enc_cfg.ss_number_layers == 1) + sizes[0] = cx_pkt->data.frame.sz; // Note computing input_layer_frames here won't account for frame // drops in rate control stats. // TODO(marpan): Fix this for non-bypass mode so we can get stats diff --git a/libvpx/examples/vpx_temporal_svc_encoder.c b/libvpx/examples/vpx_temporal_svc_encoder.c index c34673b05..f5736ea45 100644 --- a/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/libvpx/examples/vpx_temporal_svc_encoder.c @@ -26,17 +26,27 @@ #include "../tools_common.h" #include "../video_writer.h" +#define VP8_ROI_MAP 0 + static const char *exec_name; void usage_exit(void) { exit(EXIT_FAILURE); } -// Denoiser states, for temporal denoising. -enum denoiserState { - kDenoiserOff, - kDenoiserOnYOnly, - kDenoiserOnYUV, - kDenoiserOnYUVAggressive, - kDenoiserOnAdaptive +// Denoiser states for vp8, for temporal denoising. +enum denoiserStateVp8 { + kVp8DenoiserOff, + kVp8DenoiserOnYOnly, + kVp8DenoiserOnYUV, + kVp8DenoiserOnYUVAggressive, + kVp8DenoiserOnAdaptive +}; + +// Denoiser states for vp9, for temporal denoising. +enum denoiserStateVp9 { + kVp9DenoiserOff, + kVp9DenoiserOnYOnly, + // For SVC: denoise the top two spatial layers. + kVp9DenoiserOnYTwoSpatialLayers }; static int mode_to_num_layers[13] = { 1, 2, 2, 3, 3, 3, 3, 5, 2, 3, 3, 3, 3 }; @@ -154,6 +164,53 @@ static void printout_rate_control_summary(struct RateControlMetrics *rc, die("Error: Number of input frames not equal to output! 
\n"); } +#if VP8_ROI_MAP +static void vp8_set_roi_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi) { + unsigned int i, j; + memset(roi, 0, sizeof(*roi)); + + // ROI is based on the segments (4 for vp8, 8 for vp9), smallest unit for + // segment is 16x16 for vp8, 8x8 for vp9. + roi->rows = (cfg->g_h + 15) / 16; + roi->cols = (cfg->g_w + 15) / 16; + + // Applies delta QP on the segment blocks, varies from -63 to 63. + // Setting to negative means lower QP (better quality). + // Below we set delta_q to the extreme (-63) to show strong effect. + roi->delta_q[0] = 0; + roi->delta_q[1] = -63; + roi->delta_q[2] = 0; + roi->delta_q[3] = 0; + + // Applies delta loopfilter strength on the segment blocks, varies from -63 to + // 63. Setting to positive means stronger loopfilter. + roi->delta_lf[0] = 0; + roi->delta_lf[1] = 0; + roi->delta_lf[2] = 0; + roi->delta_lf[3] = 0; + + // Applies skip encoding threshold on the segment blocks, varies from 0 to + // UINT_MAX. Larger value means more skipping of encoding is possible. + // This skip threshold only applies on delta frames. + roi->static_threshold[0] = 0; + roi->static_threshold[1] = 0; + roi->static_threshold[2] = 0; + roi->static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. + roi->roi_map = + (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map)); + for (i = 0; i < roi->rows; ++i) { + for (j = 0; j < roi->cols; ++j) { + if (i > (roi->rows >> 2) && i < ((roi->rows * 3) >> 2) && + j > (roi->cols >> 2) && j < ((roi->cols * 3) >> 2)) { + roi->roi_map[i * roi->cols + j] = 1; + } + } + } +} +#endif + // Temporal scaling parameters: // NOTE: The 3 prediction frames cannot be used interchangeably due to // differences in the way they are handled throughout the code. The @@ -506,11 +563,10 @@ int main(int argc, char **argv) { int layering_mode = 0; int layer_flags[VPX_TS_MAX_PERIODICITY] = { 0 }; int flag_periodicity = 1; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) - vpx_svc_layer_id_t layer_id = { 0, 0 }; -#else - vpx_svc_layer_id_t layer_id = { 0 }; +#if VP8_ROI_MAP + vpx_roi_map_t roi; #endif + vpx_svc_layer_id_t layer_id = { 0, 0 }; const VpxInterface *encoder = NULL; FILE *infile = NULL; struct RateControlMetrics rc; @@ -637,7 +693,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp9", 3) == 0) cfg.rc_max_quantizer = 52; cfg.rc_undershoot_pct = 50; cfg.rc_overshoot_pct = 50; - cfg.rc_buf_initial_sz = 500; + cfg.rc_buf_initial_sz = 600; cfg.rc_buf_optimal_sz = 600; cfg.rc_buf_sz = 1000; @@ -707,9 +763,15 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); +#if VP8_ROI_MAP + vp8_set_roi_map(&cfg, &roi); + if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi)) + die_codec(&codec, "Failed to set ROI map"); +#endif + } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); @@ -718,7 +780,7 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); - vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff); + 
vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kVp9DenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0); vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1)); @@ -746,7 +808,7 @@ int main(int argc, char **argv) { // For generating smaller key frames, use a smaller max_intra_size_pct // value, like 100 or 200. { - const int max_intra_size_pct = 900; + const int max_intra_size_pct = 1000; vpx_codec_control(&codec, VP8E_SET_MAX_INTRA_BITRATE_PCT, max_intra_size_pct); } @@ -756,10 +818,8 @@ int main(int argc, char **argv) { struct vpx_usec_timer timer; vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt; -#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION) // Update the temporal layer_id. No spatial layers in this test. layer_id.spatial_layer_id = 0; -#endif layer_id.temporal_layer_id = cfg.ts_layer_id[frame_cnt % cfg.ts_periodicity]; if (strncmp(encoder->name, "vp9", 3) == 0) { diff --git a/libvpx/libs.mk b/libvpx/libs.mk index f1e924253..a3e2f9d0e 100644 --- a/libvpx/libs.mk +++ b/libvpx/libs.mk @@ -188,6 +188,13 @@ libvpx_srcs.txt: @echo $(CODEC_SRCS) | xargs -n1 echo | LC_ALL=C sort -u > $@ CLEAN-OBJS += libvpx_srcs.txt +# Assembly files that are included, but don't define symbols themselves. +# Filtered out to avoid Windows build warnings. +ASM_INCLUDES := \ + third_party/x86inc/x86inc.asm \ + vpx_config.asm \ + vpx_ports/x86_abi_support.asm \ + vpx_dsp/x86/bitdepth_conversion_sse2.asm \ ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) @@ -199,14 +206,6 @@ vpx.def: $(call enabled,CODEC_EXPORTS) --out=$@ $^ CLEAN-OBJS += vpx.def -# Assembly files that are included, but don't define symbols themselves. -# Filtered out to avoid Visual Studio build warnings. -ASM_INCLUDES := \ - third_party/x86inc/x86inc.asm \ - vpx_config.asm \ - vpx_ports/x86_abi_support.asm \ - vpx_dsp/x86/bitdepth_conversion_sse2.asm \ - vpx.$(VCPROJ_SFX): $(CODEC_SRCS) vpx.def @echo " [CREATE] $@" $(qexec)$(GEN_VCPROJ) \ @@ -229,13 +228,13 @@ vpx.$(VCPROJ_SFX): $(RTCD) endif else -LIBVPX_OBJS=$(call objs,$(CODEC_SRCS)) +LIBVPX_OBJS=$(call objs, $(filter-out $(ASM_INCLUDES), $(CODEC_SRCS))) OBJS-yes += $(LIBVPX_OBJS) LIBS-$(if yes,$(CONFIG_STATIC)) += $(BUILD_PFX)libvpx.a $(BUILD_PFX)libvpx_g.a $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS) -SO_VERSION_MAJOR := 4 -SO_VERSION_MINOR := 1 +SO_VERSION_MAJOR := 5 +SO_VERSION_MINOR := 0 SO_VERSION_PATCH := 0 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS)) LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib @@ -406,8 +405,16 @@ CLEAN-OBJS += libvpx_test_srcs.txt $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 @echo " [DOWNLOAD] $@" - $(qexec)trap 'rm -f $@' INT TERM &&\ - curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F)) + # Attempt to download the file using curl, retrying once if it fails for a + # partial file (18). + $(qexec)( \ + trap 'rm -f $@' INT TERM; \ + curl="curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F))"; \ + $$curl; \ + case "$$?" 
in \ + 18) $$curl -C -;; \ + esac \ + ) testdata:: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ diff --git a/libvpx/test/acm_random.h b/libvpx/test/acm_random.h index c2f6b0e41..d915cf913 100644 --- a/libvpx/test/acm_random.h +++ b/libvpx/test/acm_random.h @@ -11,6 +11,10 @@ #ifndef TEST_ACM_RANDOM_H_ #define TEST_ACM_RANDOM_H_ +#include <assert.h> + +#include <limits> + #include "third_party/googletest/src/include/gtest/gtest.h" #include "vpx/vpx_integer.h" @@ -50,6 +54,13 @@ class ACMRandom { return r < 128 ? r << 4 : r >> 4; } + uint32_t RandRange(const uint32_t range) { + // testing::internal::Random::Generate provides values in the range + // testing::internal::Random::kMaxRange. + assert(range <= testing::internal::Random::kMaxRange); + return random_.Generate(range); + } + int PseudoUniform(int range) { return random_.Generate(range); } int operator()(int n) { return PseudoUniform(n); } diff --git a/libvpx/test/android/Android.mk b/libvpx/test/android/Android.mk index 48872a2b6..7318de2fc 100644 --- a/libvpx/test/android/Android.mk +++ b/libvpx/test/android/Android.mk @@ -32,6 +32,7 @@ LOCAL_CPP_EXTENSION := .cc LOCAL_MODULE := gtest LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/ LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include/ +LOCAL_EXPORT_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/include/ LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc include $(BUILD_STATIC_LIBRARY) diff --git a/libvpx/test/avg_test.cc b/libvpx/test/avg_test.cc index c570bbc22..ad21198e4 100644 --- a/libvpx/test/avg_test.cc +++ b/libvpx/test/avg_test.cc @@ -23,6 +23,7 @@ #include "test/register_state_check.h" #include "test/util.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; @@ -367,6 +368,21 @@ TEST_P(SatdTest, Random) { Check(expected); } +TEST_P(SatdTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + TEST_P(BlockErrorTestFP, MinValue) { const int64_t kMin = -32640; const int64_t expected = kMin * kMin * txfm_size_; @@ -396,6 +412,22 @@ TEST_P(BlockErrorTestFP, Random) { Check(expected); } +TEST_P(BlockErrorTestFP, DISABLED_Speed) { + const int kCountSpeedTestBlock = 20000; + vpx_usec_timer timer; + DECLARE_ALIGNED(16, tran_low_t, coeff[1024]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[1024]); + const int blocksize = GET_PARAM(0); + + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + GET_PARAM(1)(coeff, dqcoeff, blocksize); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("blocksize: %4d time: %4d us\n", blocksize, elapsed_time); +} + using std::tr1::make_tuple; INSTANTIATE_TEST_CASE_P( @@ -454,6 +486,21 @@ INSTANTIATE_TEST_CASE_P( make_tuple(1024, &vp9_block_error_fp_sse2))); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, SatdTest, + ::testing::Values(make_tuple(16, &vpx_satd_avx2), + make_tuple(64, &vpx_satd_avx2), + make_tuple(256, &vpx_satd_avx2), + make_tuple(1024, &vpx_satd_avx2))); + +INSTANTIATE_TEST_CASE_P( + AVX2, 
BlockErrorTestFP, + ::testing::Values(make_tuple(16, &vp9_block_error_fp_avx2), + make_tuple(64, &vp9_block_error_fp_avx2), + make_tuple(256, &vp9_block_error_fp_avx2), + make_tuple(1024, &vp9_block_error_fp_avx2))); +#endif + #if HAVE_NEON INSTANTIATE_TEST_CASE_P( NEON, AverageTest, diff --git a/libvpx/test/buffer.h b/libvpx/test/buffer.h index 75016c91e..2175dad9d 100644 --- a/libvpx/test/buffer.h +++ b/libvpx/test/buffer.h @@ -19,6 +19,7 @@ #include "test/acm_random.h" #include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" namespace libvpx_test { @@ -29,29 +30,55 @@ class Buffer { int right_padding, int bottom_padding) : width_(width), height_(height), top_padding_(top_padding), left_padding_(left_padding), right_padding_(right_padding), - bottom_padding_(bottom_padding) { - Init(); - } + bottom_padding_(bottom_padding), alignment_(0), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + Buffer(int width, int height, int top_padding, int left_padding, + int right_padding, int bottom_padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(top_padding), + left_padding_(left_padding), right_padding_(right_padding), + bottom_padding_(bottom_padding), alignment_(alignment), + padding_value_(0), stride_(0), raw_size_(0), num_elements_(0), + raw_buffer_(NULL) {} Buffer(int width, int height, int padding) : width_(width), height_(height), top_padding_(padding), left_padding_(padding), right_padding_(padding), - bottom_padding_(padding) { - Init(); - } + bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0), + raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} - ~Buffer() { delete[] raw_buffer_; } + Buffer(int width, int height, int padding, unsigned int alignment) + : width_(width), height_(height), top_padding_(padding), + left_padding_(padding), right_padding_(padding), + bottom_padding_(padding), alignment_(alignment), padding_value_(0), + stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {} + + ~Buffer() { + if (alignment_) { + vpx_free(raw_buffer_); + } else { + delete[] raw_buffer_; + } + } T *TopLeftPixel() const; int stride() const { return stride_; } // Set the buffer (excluding padding) to 'value'. - void Set(const int value); + void Set(const T value); - // Set the buffer (excluding padding) to the output of ACMRandom function 'b'. + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'rand_func'. void Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()); + // Set the buffer (excluding padding) to the output of ACMRandom function + // 'RandRange' with range 'low' to 'high' which typically must be within + // testing::internal::Random::kMaxRange (1u << 31). However, because we want + // to allow negative low (and high) values, it is restricted to INT32_MAX + // here. + void Set(ACMRandom *rand_class, const T low, const T high); + // Copy the contents of Buffer 'a' (excluding padding). void CopyFrom(const Buffer<T> &a); @@ -63,11 +90,11 @@ class Buffer { bool HasPadding() const; // Sets all the values in the buffer to 'padding_value'. - void SetPadding(const int padding_value); + void SetPadding(const T padding_value); // Checks if all the values (excluding padding) are equal to 'value' if the // Buffers are the same size. - bool CheckValues(const int value) const; + bool CheckValues(const T value) const; // Check that padding matches the expected value or there is no padding. 
bool CheckPadding() const; @@ -75,21 +102,36 @@ class Buffer { // Compare the non-padding portion of two buffers if they are the same size. bool CheckValues(const Buffer<T> &a) const; - private: - void Init() { - ASSERT_GT(width_, 0); - ASSERT_GT(height_, 0); - ASSERT_GE(top_padding_, 0); - ASSERT_GE(left_padding_, 0); - ASSERT_GE(right_padding_, 0); - ASSERT_GE(bottom_padding_, 0); + bool Init() { + if (raw_buffer_ != NULL) return false; + EXPECT_GT(width_, 0); + EXPECT_GT(height_, 0); + EXPECT_GE(top_padding_, 0); + EXPECT_GE(left_padding_, 0); + EXPECT_GE(right_padding_, 0); + EXPECT_GE(bottom_padding_, 0); stride_ = left_padding_ + width_ + right_padding_; - raw_size_ = stride_ * (top_padding_ + height_ + bottom_padding_); - raw_buffer_ = new (std::nothrow) T[raw_size_]; - ASSERT_TRUE(raw_buffer_ != NULL); + num_elements_ = stride_ * (top_padding_ + height_ + bottom_padding_); + raw_size_ = num_elements_ * sizeof(T); + if (alignment_) { + EXPECT_GE(alignment_, sizeof(T)); + // Ensure alignment of the first value will be preserved. + EXPECT_EQ((left_padding_ * sizeof(T)) % alignment_, 0u); + // Ensure alignment of the subsequent rows will be preserved when there is + // a stride. + if (stride_ != width_) { + EXPECT_EQ((stride_ * sizeof(T)) % alignment_, 0u); + } + raw_buffer_ = reinterpret_cast<T *>(vpx_memalign(alignment_, raw_size_)); + } else { + raw_buffer_ = new (std::nothrow) T[num_elements_]; + } + EXPECT_TRUE(raw_buffer_ != NULL); SetPadding(std::numeric_limits<T>::max()); + return !::testing::Test::HasFailure(); } + private: bool BufferSizesMatch(const Buffer<T> &a) const; const int width_; @@ -98,44 +140,70 @@ class Buffer { const int left_padding_; const int right_padding_; const int bottom_padding_; - int padding_value_; + const unsigned int alignment_; + T padding_value_; int stride_; int raw_size_; + int num_elements_; T *raw_buffer_; }; template <typename T> T *Buffer<T>::TopLeftPixel() const { - return raw_buffer_ + (top_padding_ * stride()) + left_padding_; + if (!raw_buffer_) return NULL; + return raw_buffer_ + (top_padding_ * stride_) + left_padding_; } template <typename T> -void Buffer<T>::Set(const int value) { +void Buffer<T>::Set(const T value) { + if (!raw_buffer_) return; T *src = TopLeftPixel(); for (int height = 0; height < height_; ++height) { for (int width = 0; width < width_; ++width) { src[width] = value; } - src += stride(); + src += stride_; } } template <typename T> void Buffer<T>::Set(ACMRandom *rand_class, T (ACMRandom::*rand_func)()) { + if (!raw_buffer_) return; T *src = TopLeftPixel(); for (int height = 0; height < height_; ++height) { for (int width = 0; width < width_; ++width) { src[width] = (*rand_class.*rand_func)(); } - src += stride(); + src += stride_; } } template <typename T> -void Buffer<T>::CopyFrom(const Buffer<T> &a) { - if (!BufferSizesMatch(a)) { - return; +void Buffer<T>::Set(ACMRandom *rand_class, const T low, const T high) { + if (!raw_buffer_) return; + + EXPECT_LE(low, high); + EXPECT_LE(static_cast<int64_t>(high) - low, + std::numeric_limits<int32_t>::max()); + + T *src = TopLeftPixel(); + for (int height = 0; height < height_; ++height) { + for (int width = 0; width < width_; ++width) { + // 'low' will be promoted to unsigned given the return type of RandRange. + // Store the value as an int to avoid unsigned overflow warnings when + // 'low' is negative. 
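// Worked example of the mapping implemented just below: RandRange(high - low)
// returns an unsigned draw in [0, high - low]; adding `low` (done as int32_t
// to avoid unsigned wrap) shifts it into [low, high]. For instance, with
// low = -16 and high = 15, RandRange(31) yields [0, 31] and value + low
// yields [-16, 15].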
+ const int32_t value = + static_cast<int32_t>((*rand_class).RandRange(high - low)); + src[width] = static_cast<T>(value + low); + } + src += stride_; } +} + +template <typename T> +void Buffer<T>::CopyFrom(const Buffer<T> &a) { + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; T *a_src = a.TopLeftPixel(); T *b_src = this->TopLeftPixel(); @@ -150,10 +218,11 @@ void Buffer<T>::CopyFrom(const Buffer<T> &a) { template <typename T> void Buffer<T>::DumpBuffer() const { + if (!raw_buffer_) return; for (int height = 0; height < height_ + top_padding_ + bottom_padding_; ++height) { - for (int width = 0; width < stride(); ++width) { - printf("%4d", raw_buffer_[height + width * stride()]); + for (int width = 0; width < stride_; ++width) { + printf("%4d", raw_buffer_[height + width * stride_]); } printf("\n"); } @@ -161,14 +230,14 @@ void Buffer<T>::DumpBuffer() const { template <typename T> bool Buffer<T>::HasPadding() const { + if (!raw_buffer_) return false; return top_padding_ || left_padding_ || right_padding_ || bottom_padding_; } template <typename T> void Buffer<T>::PrintDifference(const Buffer<T> &a) const { - if (!BufferSizesMatch(a)) { - return; - } + if (!raw_buffer_) return; + if (!BufferSizesMatch(a)) return; T *a_src = a.TopLeftPixel(); T *b_src = TopLeftPixel(); @@ -206,17 +275,19 @@ void Buffer<T>::PrintDifference(const Buffer<T> &a) const { } template <typename T> -void Buffer<T>::SetPadding(const int padding_value) { +void Buffer<T>::SetPadding(const T padding_value) { + if (!raw_buffer_) return; padding_value_ = padding_value; T *src = raw_buffer_; - for (int i = 0; i < raw_size_; ++i) { + for (int i = 0; i < num_elements_; ++i) { src[i] = padding_value; } } template <typename T> -bool Buffer<T>::CheckValues(const int value) const { +bool Buffer<T>::CheckValues(const T value) const { + if (!raw_buffer_) return false; T *src = TopLeftPixel(); for (int height = 0; height < height_; ++height) { for (int width = 0; width < width_; ++width) { @@ -224,20 +295,19 @@ bool Buffer<T>::CheckValues(const int value) const { return false; } } - src += stride(); + src += stride_; } return true; } template <typename T> bool Buffer<T>::CheckPadding() const { - if (!HasPadding()) { - return true; - } + if (!raw_buffer_) return false; + if (!HasPadding()) return true; // Top padding. T const *top = raw_buffer_; - for (int i = 0; i < stride() * top_padding_; ++i) { + for (int i = 0; i < stride_ * top_padding_; ++i) { if (padding_value_ != top[i]) { return false; } @@ -251,7 +321,7 @@ bool Buffer<T>::CheckPadding() const { return false; } } - left += stride(); + left += stride_; } // Right padding. 
@@ -262,12 +332,12 @@ bool Buffer<T>::CheckPadding() const { return false; } } - right += stride(); + right += stride_; } // Bottom padding - T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride(); - for (int i = 0; i < stride() * bottom_padding_; ++i) { + T const *bottom = raw_buffer_ + (top_padding_ + height_) * stride_; + for (int i = 0; i < stride_ * bottom_padding_; ++i) { if (padding_value_ != bottom[i]) { return false; } @@ -278,9 +348,8 @@ bool Buffer<T>::CheckPadding() const { template <typename T> bool Buffer<T>::CheckValues(const Buffer<T> &a) const { - if (!BufferSizesMatch(a)) { - return false; - } + if (!raw_buffer_) return false; + if (!BufferSizesMatch(a)) return false; T *a_src = a.TopLeftPixel(); T *b_src = this->TopLeftPixel(); @@ -298,6 +367,7 @@ bool Buffer<T>::CheckValues(const Buffer<T> &a) const { template <typename T> bool Buffer<T>::BufferSizesMatch(const Buffer<T> &a) const { + if (!raw_buffer_) return false; if (a.width_ != this->width_ || a.height_ != this->height_) { printf( "Reference buffer of size %dx%d does not match this buffer which is " diff --git a/libvpx/test/byte_alignment_test.cc b/libvpx/test/byte_alignment_test.cc index d78294d10..5a058b275 100644 --- a/libvpx/test/byte_alignment_test.cc +++ b/libvpx/test/byte_alignment_test.cc @@ -128,8 +128,8 @@ class ByteAlignmentTest // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "MD5 file open failed. Filename: " << md5_file_name_; } void CheckMd5(const vpx_image_t &img) { diff --git a/libvpx/test/comp_avg_pred_test.cc b/libvpx/test/comp_avg_pred_test.cc index 3feba7127..110e06583 100644 --- a/libvpx/test/comp_avg_pred_test.cc +++ b/libvpx/test/comp_avg_pred_test.cc @@ -15,7 +15,6 @@ #include "test/acm_random.h" #include "test/buffer.h" #include "test/register_state_check.h" -#include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" namespace { @@ -28,12 +27,13 @@ typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h, uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; } -void reference_pred(const uint8_t *pred, const Buffer<uint8_t> &ref, int width, - int height, uint8_t *avg) { +void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref, + int width, int height, Buffer<uint8_t> *avg) { for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { - avg[y * width + x] = avg_with_rounding( - pred[y * width + x], ref.TopLeftPixel()[y * ref.stride() + x]); + avg->TopLeftPixel()[y * avg->stride() + x] = + avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x], + ref.TopLeftPixel()[y * ref.stride() + x]); } } } @@ -50,22 +50,10 @@ class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> { ACMRandom rnd_; }; -void fill(ACMRandom *r, uint8_t *a, const int width, const int height) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - a[x + width * y] = r->Rand8(); - } - } -} - TEST_P(AvgPredTest, SizeCombinations) { // This is called as part of the sub pixel variance. As such it must be one of // the variance block sizes. 
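The test rewrite below swaps the DECLARE_ALIGNED stack arrays for aligned Buffer objects; allocation now happens in a fallible Init() rather than the constructor, so tests can ASSERT on it. The recurring pattern, sketched as it appears inside a test body:

    // 16-byte aligned, no padding, matching the SIMD code under test.
    Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, /*padding=*/0,
                                           /*alignment=*/16);
    ASSERT_TRUE(pred.Init());            // allocation and size checks run here
    pred.Set(&rnd_, &ACMRandom::Rand8);  // random fill; padding stays untouched

The reference itself is one line of arithmetic: avg_with_rounding(a, b) computes (a + b + 1) >> 1, so ties round up, e.g. (35 + 36 + 1) >> 1 = 36.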
- DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, avg_ref[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, avg_chk[64 * 64]); - for (int width_pow = 2; width_pow <= 6; ++width_pow) { for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; ++height_pow) { @@ -80,15 +68,28 @@ TEST_P(AvgPredTest, SizeCombinations) { // Only the reference buffer may have a stride not equal to width. Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, ref_padding ? 8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg_ref.Init()); + Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg_chk.Init()); - fill(&rnd_, pred, width, height); ref.Set(&rnd_, &ACMRandom::Rand8); - - reference_pred(pred, ref, width, height, avg_ref); - ASM_REGISTER_STATE_CHECK(avg_pred_func_( - avg_chk, pred, width, height, ref.TopLeftPixel(), ref.stride())); - ASSERT_EQ(memcmp(avg_ref, avg_chk, sizeof(*avg_ref) * width * height), - 0); + pred.Set(&rnd_, &ACMRandom::Rand8); + + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK( + avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width, + height, ref.TopLeftPixel(), ref.stride())); + + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } } } } @@ -98,25 +99,32 @@ TEST_P(AvgPredTest, CompareReferenceRandom) { const int width = 64; const int height = 32; Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8); - DECLARE_ALIGNED(16, uint8_t, pred[width * height]); - DECLARE_ALIGNED(16, uint8_t, avg_ref[width * height]); - DECLARE_ALIGNED(16, uint8_t, avg_chk[width * height]); + ASSERT_TRUE(ref.Init()); + Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg_ref.Init()); + Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg_chk.Init()); for (int i = 0; i < 500; ++i) { - fill(&rnd_, pred, width, height); ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); - reference_pred(pred, ref, width, height, avg_ref); - ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk, pred, width, height, + reference_pred(pred, ref, width, height, &avg_ref); + ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(), + pred.TopLeftPixel(), width, height, ref.TopLeftPixel(), ref.stride())); - ASSERT_EQ(memcmp(avg_ref, avg_chk, sizeof(*avg_ref) * width * height), 0); + EXPECT_TRUE(avg_chk.CheckValues(avg_ref)); + if (HasFailure()) { + printf("Width: %d Height: %d\n", width, height); + avg_chk.PrintDifference(avg_ref); + return; + } } } TEST_P(AvgPredTest, DISABLED_Speed) { - DECLARE_ALIGNED(16, uint8_t, pred[64 * 64]); - DECLARE_ALIGNED(16, uint8_t, avg[64 * 64]); - for (int width_pow = 2; width_pow <= 6; ++width_pow) { for (int height_pow = width_pow - 1; height_pow <= width_pow + 1; ++height_pow) { @@ -128,15 +136,20 @@ TEST_P(AvgPredTest, DISABLED_Speed) { const int height = 1 << height_pow; Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, ref_padding ? 
8 : 0); + ASSERT_TRUE(ref.Init()); + Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(pred.Init()); + Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16); + ASSERT_TRUE(avg.Init()); - fill(&rnd_, pred, width, height); ref.Set(&rnd_, &ACMRandom::Rand8); + pred.Set(&rnd_, &ACMRandom::Rand8); vpx_usec_timer timer; vpx_usec_timer_start(&timer); for (int i = 0; i < 10000000 / (width * height); ++i) { - avg_pred_func_(avg, pred, width, height, ref.TopLeftPixel(), - ref.stride()); + avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height, + ref.TopLeftPixel(), ref.stride()); } vpx_usec_timer_mark(&timer); @@ -156,6 +169,12 @@ INSTANTIATE_TEST_CASE_P(C, AvgPredTest, INSTANTIATE_TEST_CASE_P(SSE2, AvgPredTest, ::testing::Values(&vpx_comp_avg_pred_sse2)); #endif // HAVE_SSE2 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, AvgPredTest, + ::testing::Values(&vpx_comp_avg_pred_neon)); +#endif // HAVE_NEON + #if HAVE_VSX INSTANTIATE_TEST_CASE_P(VSX, AvgPredTest, ::testing::Values(&vpx_comp_avg_pred_vsx)); diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc index 535b9b07f..70f0b11a7 100644 --- a/libvpx/test/convolve_test.cc +++ b/libvpx/test/convolve_test.cc @@ -33,9 +33,9 @@ static const unsigned int kMaxDimension = 64; typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h); + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); typedef void (*WrapperFilterBlock2d8Func)( const uint8_t *src_ptr, const unsigned int src_stride, @@ -550,7 +550,7 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, + UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, width, height); } vpx_usec_timer_mark(&timer); @@ -570,7 +570,7 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, + UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, width, height); } vpx_usec_timer_mark(&timer); @@ -580,12 +580,127 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) { UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); } +TEST_P(ConvolveTest, DISABLED_Scale_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve_scale_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Horiz_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->h8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_horiz_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Vert_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->v8_[0](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_vert_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? UUT_->use_highbd_ : 8, elapsed_time); +} + +TEST_P(ConvolveTest, DISABLED_8Tap_Avg_Speed) { + const uint8_t *const in = input(); + uint8_t *const out = output(); + const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP_SHARP]; + const int kNumTests = 5000000; + const int width = Width(); + const int height = Height(); + vpx_usec_timer timer; + + SetConstantInput(127); + + vpx_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + UUT_->hv8_[1](in, kInputStride, out, kOutputStride, eighttap, 8, 16, 8, 16, + width, height); + } + vpx_usec_timer_mark(&timer); + + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + printf("convolve8_avg_%dx%d_%d: %d us\n", width, height, + UUT_->use_highbd_ ? 
UUT_->use_highbd_ : 8, elapsed_time); +} + TEST_P(ConvolveTest, Copy) { uint8_t *const in = input(); uint8_t *const out = output(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -604,7 +719,7 @@ TEST_P(ConvolveTest, Avg) { CopyOutputToRef(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -621,12 +736,10 @@ TEST_P(ConvolveTest, Avg) { TEST_P(ConvolveTest, CopyHoriz) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -641,12 +754,10 @@ TEST_P(ConvolveTest, CopyHoriz) { TEST_P(ConvolveTest, CopyVert) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -661,12 +772,10 @@ TEST_P(ConvolveTest, CopyVert) { TEST_P(ConvolveTest, Copy2D) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -702,7 +811,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { } } -const int16_t kInvalidFilter[8] = { 0 }; const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = { wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c }; @@ -755,21 +863,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_[i]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_[i]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[i]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -853,21 +961,21 @@ TEST_P(ConvolveTest, FilterExtremes) { filters[filter_y], ref, 
kOutputStride, Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_[0]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_[0]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[0]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) @@ -886,45 +994,63 @@ TEST_P(ConvolveTest, FilterExtremes) { /* This test exercises that enough rows and columns are filtered with every possible initial fractional positions and scaling steps. */ +#if !CONFIG_VP9_HIGHBITDEPTH +static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c, + vpx_scaled_avg_2d_c }; + TEST_P(ConvolveTest, CheckScalingFiltering) { uint8_t *const in = input(); uint8_t *const out = output(); - const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP]; + uint8_t ref[kOutputStride * kMaxDimension]; - SetConstantInput(127); + ::libvpx_test::ACMRandom prng; + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + const uint16_t r = prng.Rand8Extremes(); + assign_val(in, y * kInputStride + x, r); + } + } + + for (int i = 0; i < 2; ++i) { + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + const InterpKernel *const eighttap = vp9_filter_kernels[filter_type]; + for (int frac = 0; frac < 16; ++frac) { + for (int step = 1; step <= 32; ++step) { + /* Test the horizontal and vertical filters in combination. */ + scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height()); + ASM_REGISTER_STATE_CHECK( + UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap, + frac, step, frac, step, Width(), Height())); - for (int frac = 0; frac < 16; ++frac) { - for (int step = 1; step <= 32; ++step) { - /* Test the horizontal and vertical filters in combination. 
*/ - ASM_REGISTER_STATE_CHECK( - UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac], - step, eighttap[frac], step, Width(), Height())); - - CheckGuardBlocks(); - - for (int y = 0; y < Height(); ++y) { - for (int x = 0; x < Width(); ++x) { - ASSERT_EQ(lookup(in, y * kInputStride + x), - lookup(out, y * kOutputStride + x)) - << "x == " << x << ", y == " << y << ", frac == " << frac - << ", step == " << step; + CheckGuardBlocks(); + + for (int y = 0; y < Height(); ++y) { + for (int x = 0; x < Width(); ++x) { + ASSERT_EQ(lookup(ref, y * kOutputStride + x), + lookup(out, y * kOutputStride + x)) + << "x == " << x << ", y == " << y << ", frac == " << frac + << ", step == " << step; + } + } } } } } } +#endif using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -#define WRAP(func, bd) \ - void wrap_##func##_##bd( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \ - const int16_t *filter_y, int filter_y_stride, int w, int h) { \ - vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \ - reinterpret_cast<uint16_t *>(dst), dst_stride, filter_x, \ - filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ +#define WRAP(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \ + reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); \ } #if HAVE_SSE2 && ARCH_X86_64 @@ -1161,8 +1287,8 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, #else // !CONFIG_VP9_HIGHBITDEPTH const ConvolveFunctions convolve8_avx2( vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_avx2, - vpx_convolve8_avg_horiz_ssse3, vpx_convolve8_vert_avx2, - vpx_convolve8_avg_vert_ssse3, vpx_convolve8_avx2, vpx_convolve8_avg_ssse3, + vpx_convolve8_avg_horiz_avx2, vpx_convolve8_vert_avx2, + vpx_convolve8_avg_vert_avx2, vpx_convolve8_avx2, vpx_convolve8_avg_avx2, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES(convolve8_avx2) }; @@ -1206,7 +1332,7 @@ const ConvolveFunctions convolve8_neon( vpx_convolve8_avg_horiz_neon, vpx_convolve8_vert_neon, vpx_convolve8_avg_vert_neon, vpx_convolve8_neon, vpx_convolve8_avg_neon, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_scaled_avg_vert_c, vpx_scaled_2d_neon, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve_neon[] = { ALL_SIZES(convolve8_neon) }; #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1233,7 +1359,7 @@ const ConvolveFunctions convolve8_msa( vpx_convolve8_avg_horiz_msa, vpx_convolve8_vert_msa, vpx_convolve8_avg_vert_msa, vpx_convolve8_msa, vpx_convolve8_avg_msa, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, - vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); + vpx_scaled_avg_vert_c, vpx_scaled_2d_msa, vpx_scaled_avg_2d_c, 0); const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES(convolve8_msa) }; INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest, diff --git a/libvpx/test/datarate_test.cc b/libvpx/test/datarate_test.cc index a120a88d2..31a8523d2 100644 --- a/libvpx/test/datarate_test.cc +++ b/libvpx/test/datarate_test.cc @@ -44,6 
+44,7 @@ class DatarateTestLarge denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; gf_boost_ = 0; + use_roi_ = 0; } virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -54,6 +55,12 @@ class DatarateTestLarge encoder->Control(VP8E_SET_GF_CBR_BOOST_PCT, gf_boost_); } +#if CONFIG_VP8_ENCODER + if (use_roi_ == 1) { + encoder->Control(VP8E_SET_ROI_MAP, &roi_); + } +#endif + if (denoiser_offon_test_) { ASSERT_GT(denoiser_offon_period_, 0) << "denoiser_offon_period_ is not positive."; @@ -91,8 +98,8 @@ class DatarateTestLarge const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; if (!key_frame) { - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; } const int64_t frame_size_in_bits = pkt->data.frame.sz * 8; @@ -145,6 +152,8 @@ class DatarateTestLarge int denoiser_offon_period_; int set_cpu_used_; int gf_boost_; + int use_roi_; + vpx_roi_map_t roi_; }; #if CONFIG_TEMPORAL_DENOISING @@ -258,14 +267,6 @@ TEST_P(DatarateTestLarge, ChangingDropFrameThresh) { } } -// Disabled for tsan, see: -// https://bugs.chromium.org/p/webm/issues/detail?id=1049 -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif -#ifndef BUILDING_WITH_TSAN TEST_P(DatarateTestLarge, DropFramesMultiThreads) { denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; @@ -285,7 +286,6 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) { ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } -#endif // !BUILDING_WITH_TSAN class DatarateTestRealTime : public DatarateTestLarge { public: @@ -402,10 +402,6 @@ TEST_P(DatarateTestRealTime, ChangingDropFrameThresh) { } } -// Disabled for tsan, see: -// https://bugs.chromium.org/p/webm/issues/detail?id=1049 - -#ifndef BUILDING_WITH_TSAN TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { denoiser_on_ = 0; cfg_.rc_buf_initial_sz = 500; @@ -426,7 +422,67 @@ TEST_P(DatarateTestRealTime, DropFramesMultiThreads) { ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) << " The datarate for the file missed the target!"; } -#endif + +TEST_P(DatarateTestRealTime, RegionOfInterest) { + denoiser_on_ = 0; + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_dropframe_thresh = 0; + cfg_.rc_max_quantizer = 56; + cfg_.rc_end_usage = VPX_CBR; + // Encode using multiple threads. + cfg_.g_threads = 2; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + cfg_.rc_target_bitrate = 450; + cfg_.g_w = 352; + cfg_.g_h = 288; + + ResetModel(); + + // Set ROI parameters + use_roi_ = 1; + memset(&roi_, 0, sizeof(roi_)); + + roi_.rows = (cfg_.g_h + 15) / 16; + roi_.cols = (cfg_.g_w + 15) / 16; + + roi_.delta_q[0] = 0; + roi_.delta_q[1] = -20; + roi_.delta_q[2] = 0; + roi_.delta_q[3] = 0; + + roi_.delta_lf[0] = 0; + roi_.delta_lf[1] = -20; + roi_.delta_lf[2] = 0; + roi_.delta_lf[3] = 0; + + roi_.static_threshold[0] = 0; + roi_.static_threshold[1] = 1000; + roi_.static_threshold[2] = 0; + roi_.static_threshold[3] = 0; + + // Use 2 states: 1 is center square, 0 is the rest. 
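The ROI map just configured is one byte per 16x16 macroblock, so for this 352x288 clip it spans 22 columns by 18 rows, and segment 1 (the center square marked by the fill loop that follows) gets a quality boost through the -20 delta_q/delta_lf values plus a high static threshold. A minimal standalone sketch of the same center-square map, with hypothetical names (rows/cols/map) standing in for the test's roi_ fields:

  const unsigned int rows = (288 + 15) / 16;  // 18 macroblock rows
  const unsigned int cols = (352 + 15) / 16;  // 22 macroblock columns
  uint8_t *map = (uint8_t *)calloc(rows * cols, sizeof(*map));
  for (unsigned int i = 0; i < rows; ++i)
    for (unsigned int j = 0; j < cols; ++j)
      if (i > rows / 4 && i < (rows * 3) / 4 &&
          j > cols / 4 && j < (cols * 3) / 4)
        map[i * cols + j] = 1;  // segment 1: the boosted center square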
+ roi_.roi_map = + (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map)); + for (unsigned int i = 0; i < roi_.rows; ++i) { + for (unsigned int j = 0; j < roi_.cols; ++j) { + if (i > (roi_.rows >> 2) && i < ((roi_.rows * 3) >> 2) && + j > (roi_.cols >> 2) && j < ((roi_.cols * 3) >> 2)) { + roi_.roi_map[i * roi_.cols + j] = 1; + } + } + } + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.95) + << " The datarate for the file exceeds the target!"; + + ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.4) + << " The datarate for the file missed the target!"; + + free(roi_.roi_map); +} TEST_P(DatarateTestRealTime, GFBoost) { denoiser_on_ = 0; @@ -482,6 +538,7 @@ class DatarateTestVP9Large } denoiser_offon_test_ = 0; denoiser_offon_period_ = -1; + frame_parallel_decoding_mode_ = 1; } // @@ -561,6 +618,8 @@ class DatarateTestVP9Large encoder->Control(VP9E_SET_NOISE_SENSITIVITY, denoiser_on_); encoder->Control(VP9E_SET_TILE_COLUMNS, (cfg_.g_threads >> 1)); + encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, + frame_parallel_decoding_mode_); if (cfg_.ts_number_layers > 1) { if (video->frame() == 0) { @@ -599,8 +658,8 @@ class DatarateTestVP9Large duration * timebase_ * cfg_.rc_target_bitrate * 1000); // Buffer should not go negative. - ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - << pkt->data.frame.pts; + ASSERT_GE(bits_in_buffer_model_, 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; const size_t frame_size_in_bits = pkt->data.frame.sz * 8; @@ -641,6 +700,7 @@ class DatarateTestVP9Large int denoiser_on_; int denoiser_offon_test_; int denoiser_offon_period_; + int frame_parallel_decoding_mode_; }; // Check basic rate targeting for VBR mode with 0 lag. @@ -659,7 +719,7 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) << " The datarate for the file is greater than target by too much!"; } } @@ -686,7 +746,37 @@ TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZero) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) << " The datarate for the file is lower than target by too much!"; - ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.25) + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) + << " The datarate for the file is greater than target by too much!"; + } +} + +// Check basic rate targeting for VBR mode with non-zero lag, with +// frame_parallel_decoding_mode off. This enables the adapt_coeff/mode/mv probs +// since error_resilience is off. +TEST_P(DatarateTestVP9Large, BasicRateTargetingVBRLagNonZeroFrameParDecOff) { + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.g_error_resilient = 0; + cfg_.rc_end_usage = VPX_VBR; + // For non-zero lag, rate control will work (be within bounds) for + // real-time mode. 
+ if (deadline_ == VPX_DL_REALTIME) { + cfg_.g_lag_in_frames = 15; + } else { + cfg_.g_lag_in_frames = 0; + } + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + for (int i = 400; i <= 800; i += 400) { + cfg_.rc_target_bitrate = i; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.75) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.30) << " The datarate for the file is greater than target by too much!"; } } @@ -715,6 +805,33 @@ TEST_P(DatarateTestVP9Large, BasicRateTargeting) { } } +// Check basic rate targeting for CBR mode, with frame_parallel_decoding_mode +// off (and error_resilience off). +TEST_P(DatarateTestVP9Large, BasicRateTargetingFrameParDecOff) { + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + cfg_.g_lag_in_frames = 0; + cfg_.g_error_resilient = 0; + + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 140); + for (int i = 150; i < 800; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + frame_parallel_decoding_mode_ = 0; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(effective_datarate_[0], cfg_.rc_target_bitrate * 0.85) + << " The datarate for the file is lower than target by too much!"; + ASSERT_LE(effective_datarate_[0], cfg_.rc_target_bitrate * 1.15) + << " The datarate for the file is greater than target by too much!"; + } +} + // Check basic rate targeting for CBR mode, with 2 threads and dropped frames. TEST_P(DatarateTestVP9Large, BasicRateTargetingDropFramesMultiThreads) { cfg_.rc_buf_initial_sz = 500; @@ -1099,16 +1216,17 @@ class DatarateOnePassCbrSvc } virtual void ResetModel() { last_pts_ = 0; - bits_in_buffer_model_ = cfg_.rc_target_bitrate * cfg_.rc_buf_initial_sz; - frame_number_ = 0; - first_drop_ = 0; - bits_total_ = 0; duration_ = 0.0; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; denoiser_on_ = 0; tune_content_ = 0; base_speed_setting_ = 5; + spatial_layer_id_ = 0; + temporal_layer_id_ = 0; + memset(bits_in_buffer_model_, 0, sizeof(bits_in_buffer_model_)); + memset(bits_total_, 0, sizeof(bits_total_)); + memset(layer_target_avg_bandwidth_, 0, sizeof(layer_target_avg_bandwidth_)); } virtual void BeginPassHook(unsigned int /*pass*/) {} virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, @@ -1139,32 +1257,94 @@ class DatarateOnePassCbrSvc timebase_ = static_cast<double>(tb.num) / tb.den; duration_ = 0; } + + virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + vpx_svc_layer_id_t layer_id; + encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id); + spatial_layer_id_ = layer_id.spatial_layer_id; + temporal_layer_id_ = layer_id.temporal_layer_id; + // Update buffer with per-layer target frame bandwidth; this is done + // for every frame passed to the encoder (encoded or dropped). + // For temporal layers, update the cumulative buffer level.
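The loop below applies that credit. Together with the matching debit in FramePktHook further down, it forms a per-layer leaky bucket; a hedged sketch of the model, with buffer/encoded_bytes as shorthand for the test's bits_in_buffer_model_ and the parsed superframe sizes:

  buffer[layer] += layer_target_bitrate[layer] * 1000 / layer_fps   (every input frame)
  buffer[layer] -= 8 * encoded_bytes[layer]                         (every encoded frame)
  buffer[layer] >= 0 must hold                                      (no underrun)

Here layer_fps is the cumulative framerate of the temporal layer: with three temporal layers at 30 fps input, TL0 runs at 7.5 fps and TL0+TL1 at 15 fps, as assign_layer_bitrates below computes.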
+ for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_in_buffer_model_[layer] += + static_cast<int64_t>(layer_target_avg_bandwidth_[layer]); + } + } + } + + vpx_codec_err_t parse_superframe_index(const uint8_t *data, size_t data_sz, + uint32_t sizes[8], int *count) { + uint8_t marker; + marker = *(data + data_sz - 1); + *count = 0; + if ((marker & 0xe0) == 0xc0) { + const uint32_t frames = (marker & 0x7) + 1; + const uint32_t mag = ((marker >> 3) & 0x3) + 1; + const size_t index_sz = 2 + mag * frames; + // This chunk is marked as having a superframe index but doesn't have + // enough data for it, thus it's an invalid superframe index. + if (data_sz < index_sz) return VPX_CODEC_CORRUPT_FRAME; + { + const uint8_t marker2 = *(data + data_sz - index_sz); + // This chunk is marked as having a superframe index but doesn't have + // the matching marker byte at the front of the index therefore it's an + // invalid chunk. + if (marker != marker2) return VPX_CODEC_CORRUPT_FRAME; + } + { + uint32_t i, j; + const uint8_t *x = &data[data_sz - index_sz + 1]; + for (i = 0; i < frames; ++i) { + uint32_t this_sz = 0; + + for (j = 0; j < mag; ++j) this_sz |= (*x++) << (j * 8); + sizes[i] = this_sz; + } + *count = frames; + } + } + return VPX_CODEC_OK; + } + virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; - if (last_pts_ == 0) duration = 1; - bits_in_buffer_model_ += static_cast<int64_t>( - duration * timebase_ * cfg_.rc_target_bitrate * 1000); + uint32_t sizes[8] = { 0 }; + int count = 0; + last_pts_ = pkt->data.frame.pts; const bool key_frame = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? true : false; - if (!key_frame) { - // TODO(marpan): This check currently fails for some of the SVC tests, - // re-enable when issue (webm:1350) is resolved. - // ASSERT_GE(bits_in_buffer_model_, 0) << "Buffer Underrun at frame " - // << pkt->data.frame.pts; + parse_superframe_index(static_cast<const uint8_t *>(pkt->data.frame.buf), + pkt->data.frame.sz, sizes, &count); + ASSERT_EQ(count, number_spatial_layers_); + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + sizes[sl] = sizes[sl] << 3; + // Update the total encoded bits per layer. + // For temporal layers, update the cumulative encoded bits per layer. + for (int tl = temporal_layer_id_; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + bits_total_[layer] += static_cast<int64_t>(sizes[sl]); + // Update the per-layer buffer level with the encoded frame size. + bits_in_buffer_model_[layer] -= static_cast<int64_t>(sizes[sl]); + // There should be no buffer underrun, except on the base + // temporal layer, since there may be key frames there. 
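Before the underrun check continues below, a worked example of the trailing superframe index that parse_superframe_index just decoded (the byte values are hypothetical):

  marker = 0xc9 = 0b11001001  ->  (marker & 0xe0) == 0xc0, an index is present
  frames   = (marker & 0x7) + 1 = 2         // one frame per spatial layer
  mag      = ((marker >> 3) & 0x3) + 1 = 2  // 2 bytes per size field
  index_sz = 2 + mag * frames = 6           // marker + 2x2 size bytes + marker

So the chunk ends with { 0xc9, sz0_lo, sz0_hi, sz1_lo, sz1_hi, 0xc9 }, the per-frame sizes are assembled little-endian, and the ASSERT_EQ above holds because each encoded superframe carries exactly one frame per spatial layer.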
+ if (!key_frame && tl > 0) { + ASSERT_GE(bits_in_buffer_model_[layer], 0) + << "Buffer Underrun at frame " << pkt->data.frame.pts; + } + } } - const size_t frame_size_in_bits = pkt->data.frame.sz * 8; - bits_in_buffer_model_ -= static_cast<int64_t>(frame_size_in_bits); - bits_total_ += frame_size_in_bits; - if (!first_drop_ && duration > 1) first_drop_ = last_pts_ + 1; - last_pts_ = pkt->data.frame.pts; - bits_in_last_frame_ = frame_size_in_bits; - ++frame_number_; } + virtual void EndPassHook(void) { - if (bits_total_) { - const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit - duration_ = (last_pts_ + 1) * timebase_; - file_datarate_ = file_size_in_kb / duration_; + for (int sl = 0; sl < number_spatial_layers_; ++sl) { + for (int tl = 0; tl < number_temporal_layers_; ++tl) { + const int layer = sl * number_temporal_layers_ + tl; + const double file_size_in_kb = bits_total_[layer] / 1000.; + duration_ = (last_pts_ + 1) * timebase_; + file_datarate_[layer] = file_size_in_kb / duration_; + } } } @@ -1177,13 +1357,11 @@ class DatarateOnePassCbrSvc unsigned int GetMismatchFrames() { return mismatch_nframes_; } vpx_codec_pts_t last_pts_; - int64_t bits_in_buffer_model_; + int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; double timebase_; - int frame_number_; - vpx_codec_pts_t first_drop_; - int64_t bits_total_; + int64_t bits_total_[VPX_MAX_LAYERS]; double duration_; - double file_datarate_; + double file_datarate_[VPX_MAX_LAYERS]; size_t bits_in_last_frame_; vpx_svc_extra_cfg_t svc_params_; int speed_setting_; @@ -1192,14 +1370,22 @@ class DatarateOnePassCbrSvc int denoiser_on_; int tune_content_; int base_speed_setting_; + int spatial_layer_id_; + int temporal_layer_id_; + int number_spatial_layers_; + int number_temporal_layers_; + int layer_target_avg_bandwidth_[VPX_MAX_LAYERS]; }; static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, const vpx_svc_extra_cfg_t *svc_params, int spatial_layers, int temporal_layers, - int temporal_layering_mode) { + int temporal_layering_mode, + int *layer_target_avg_bandwidth, + int64_t *bits_in_buffer_model) { int sl, spatial_layer_target; float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; + float framerate = 30.0; for (sl = 0; sl < spatial_layers; ++sl) { if (svc_params->scaling_factor_den[sl] > 0) { alloc_ratio[sl] = (float)(svc_params->scaling_factor_num[sl] * 1.0 / @@ -1219,10 +1405,43 @@ static void assign_layer_bitrates(vpx_codec_enc_cfg_t *const enc_cfg, } else if (temporal_layering_mode == 2) { enc_cfg->layer_target_bitrate[index] = spatial_layer_target * 2 / 3; enc_cfg->layer_target_bitrate[index + 1] = spatial_layer_target; + } else if (temporal_layering_mode <= 1) { + enc_cfg->layer_target_bitrate[index] = spatial_layer_target; + } + } + for (sl = 0; sl < spatial_layers; ++sl) { + for (int tl = 0; tl < temporal_layers; ++tl) { + const int layer = sl * temporal_layers + tl; + float layer_framerate = framerate; + if (temporal_layers == 2 && tl == 0) layer_framerate = framerate / 2; + if (temporal_layers == 3 && tl == 0) layer_framerate = framerate / 4; + if (temporal_layers == 3 && tl == 1) layer_framerate = framerate / 2; + layer_target_avg_bandwidth[layer] = static_cast<int>( + enc_cfg->layer_target_bitrate[layer] * 1000.0 / layer_framerate); + bits_in_buffer_model[layer] = + enc_cfg->layer_target_bitrate[layer] * enc_cfg->rc_buf_initial_sz; } } } +static void CheckLayerRateTargeting(vpx_codec_enc_cfg_t *const cfg, + int number_spatial_layers, + int number_temporal_layers, + double *file_datarate, + 
double thresh_overshoot, + double thresh_undershoot) { + for (int sl = 0; sl < number_spatial_layers; ++sl) + for (int tl = 0; tl < number_temporal_layers; ++tl) { + const int layer = sl * number_temporal_layers + tl; + ASSERT_GE(cfg->layer_target_bitrate[layer], + file_datarate[layer] * thresh_overshoot) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(cfg->layer_target_bitrate[layer], + file_datarate[layer] * thresh_undershoot) + << " The datarate for the file is lower than the target by too much!"; + } +} + // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1 // temporal layer, with screen content mode on and same speed setting for all // layers. @@ -1246,14 +1465,19 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TLScreenContent1) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 500; ResetModel(); tune_content_ = 1; base_speed_setting_ = speed_setting_; assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); } @@ -1281,26 +1505,28 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. for (int i = 200; i <= 800; i += 200) { cfg_.rc_target_bitrate = i; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); #if CONFIG_VP9_DECODER // Number of temporal layers > 1, so half of the frames in this SVC pattern // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 100 (half of the sequence) + // Since frame dropper is off, we can expect 200 (half of the sequence) // mismatched frames. 
- EXPECT_EQ(static_cast<unsigned int>(100), GetMismatchFrames()); + EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); #endif } } @@ -1329,33 +1555,41 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLDenoiserOn) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // TODO(marpan): Check that effective_datarate for each layer hits the // layer target_bitrate. - for (int i = 600; i <= 1000; i += 200) { - cfg_.rc_target_bitrate = i; - ResetModel(); - denoiser_on_ = 1; - assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); - ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; + // For SVC, noise_sen = 1 means denoising only the top spatial layer + // noise_sen = 2 means denoising the two top spatial layers. + for (int noise_sen = 1; noise_sen <= 2; noise_sen++) { + for (int i = 600; i <= 1000; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + denoiser_on_ = noise_sen; + assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); #if CONFIG_VP9_DECODER - // Number of temporal layers > 1, so half of the frames in this SVC pattern - // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 150 (half of the sequence) - // mismatched frames. - EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames()); + // Number of temporal layers > 1, so half of the frames in this SVC + // pattern + // will be non-reference frame and hence encoder will avoid loopfilter. + // Since frame dropper is off, we can expect 200 (half of the sequence) + // mismatched frames. + EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); #endif + } } } // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3 // temporal layers. Run CIF clip with 1 thread, and few short key frame periods. 
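Before the SmallKf variant below, it is worth unpacking the 0.78/1.15 pair these tests now pass to CheckLayerRateTargeting; the two assertions bound each layer's measured datarate from both sides:

  target >= 0.78 * datarate  =>  datarate <= target / 0.78  (about 1.28 * target)
  target <= 1.15 * datarate  =>  datarate >= target / 1.15  (about 0.87 * target)

For example, a 500 kbps layer passes anywhere between roughly 435 and 641 kbps, and the check runs over all number_spatial_layers_ * number_temporal_layers_ entries of file_datarate_.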
-TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TLSmallKf) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1376,27 +1610,29 @@ TEST_P(DatarateOnePassCbrSvc, DISABLED_OnePassCbrSvc2SL3TLSmallKf) { svc_params_.scaling_factor_num[1] = 288; svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 200); cfg_.rc_target_bitrate = 400; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). for (int j = 64; j <= 67; j++) { cfg_.kf_max_dist = j; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); } } // Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and // 3 temporal layers. Run HD clip with 4 threads. -TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4threads) { +TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4Threads) { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -1418,22 +1654,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL3TL4threads) { svc_params_.scaling_factor_den[1] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.15) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); #if CONFIG_VP9_DECODER // Number of temporal layers > 1, so half of the frames in this SVC pattern // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 150 (half of the sequence) + // Since frame dropper is off, we can expect 30 (half of the sequence) // mismatched frames. 
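The updated mismatch counts simply track the new clip lengths: in these three-temporal-layer patterns every other frame is a non-reference frame that skips the loop filter, so with the frame dropper off the decoder mismatch count is half the frame total:

  niklas_640_480_30.yuv runs:  400 frames -> 200 expected mismatches
  niklas_1280_720_30.y4m runs:  60 frames ->  30 expected mismatches

The revised EXPECT_EQ values follow.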
- EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames()); + EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames()); #endif } @@ -1463,22 +1700,24 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL) { svc_params_.scaling_factor_den[2] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); #if CONFIG_VP9_DECODER // Number of temporal layers > 1, so half of the frames in this SVC pattern // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 150 (half of the sequence) + // Since frame dropper is off, we can expect 200 (half of the sequence) // mismatched frames. - EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames()); + EXPECT_EQ(static_cast<unsigned int>(200), GetMismatchFrames()); #endif } @@ -1507,20 +1746,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TLSmallKf) { svc_params_.scaling_factor_num[2] = 288; svc_params_.scaling_factor_den[2] = 288; cfg_.rc_dropframe_thresh = 10; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); cfg_.rc_target_bitrate = 800; + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, + 0, 400); // For this 3 temporal layer case, pattern repeats every 4 frames, so choose // 4 key neighboring key frame periods (so key frame will land on 0-2-1-2). 
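Concretely, with the 0-2-1-2 temporal pattern noted above repeating every 4 frames, the four kf_max_dist values iterated below land the forced key frame on each position of the cycle:

  kf_max_dist = 32  ->  32 % 4 == 0  ->  key frame on a TL0 slot
  kf_max_dist = 33  ->  33 % 4 == 1  ->  TL2
  kf_max_dist = 34  ->  34 % 4 == 2  ->  TL1
  kf_max_dist = 35  ->  35 % 4 == 3  ->  TL2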
for (int j = 32; j <= 35; j++) { cfg_.kf_max_dist = j; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.80) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.30) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, + 1.15); } } @@ -1550,22 +1792,23 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc3SL3TL4threads) { svc_params_.scaling_factor_den[2] = 288; cfg_.rc_dropframe_thresh = 0; cfg_.kf_max_dist = 9999; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); cfg_.rc_target_bitrate = 800; ResetModel(); assign_layer_bitrates(&cfg_, &svc_params_, cfg_.ss_number_layers, - cfg_.ts_number_layers, cfg_.temporal_layering_mode); + cfg_.ts_number_layers, cfg_.temporal_layering_mode, + layer_target_avg_bandwidth_, bits_in_buffer_model_); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_GE(cfg_.rc_target_bitrate, file_datarate_ * 0.78) - << " The datarate for the file exceeds the target by too much!"; - ASSERT_LE(cfg_.rc_target_bitrate, file_datarate_ * 1.22) - << " The datarate for the file is lower than the target by too much!"; + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); #if CONFIG_VP9_DECODER // Number of temporal layers > 1, so half of the frames in this SVC pattern // will be non-reference frame and hence encoder will avoid loopfilter. - // Since frame dropper is off, we can expcet 150 (half of the sequence) + // Since frame dropper is off, we can expect 30 (half of the sequence) // mismatched frames. 
- EXPECT_EQ(static_cast<unsigned int>(150), GetMismatchFrames()); + EXPECT_EQ(static_cast<unsigned int>(30), GetMismatchFrames()); #endif } @@ -1597,9 +1840,19 @@ TEST_P(DatarateOnePassCbrSvc, OnePassCbrSvc2SL1TL5x5MultipleRuns) { cfg_.layer_target_bitrate[0] = 300; cfg_.layer_target_bitrate[1] = 1400; cfg_.rc_target_bitrate = 1700; - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 300); + number_spatial_layers_ = cfg_.ss_number_layers; + number_temporal_layers_ = cfg_.ts_number_layers; ResetModel(); + layer_target_avg_bandwidth_[0] = cfg_.layer_target_bitrate[0] * 1000 / 30; + bits_in_buffer_model_[0] = + cfg_.layer_target_bitrate[0] * cfg_.rc_buf_initial_sz; + layer_target_avg_bandwidth_[1] = cfg_.layer_target_bitrate[1] * 1000 / 30; + bits_in_buffer_model_[1] = + cfg_.layer_target_bitrate[1] * cfg_.rc_buf_initial_sz; + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + CheckLayerRateTargeting(&cfg_, number_spatial_layers_, + number_temporal_layers_, file_datarate_, 0.78, 1.15); EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); } diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc index 6ea77fde2..ce0bd37b3 100644 --- a/libvpx/test/dct16x16_test.cc +++ b/libvpx/test/dct16x16_test.cc @@ -542,8 +542,8 @@ class Trans16x16TestBase { const uint32_t diff = dst[j] - src[j]; #endif // CONFIG_VP9_HIGHBITDEPTH const uint32_t error = diff * diff; - EXPECT_GE(1u, error) << "Error: 16x16 IDCT has error " << error - << " at index " << j; + EXPECT_GE(1u, error) + << "Error: 16x16 IDCT has error " << error << " at index " << j; } } } @@ -744,66 +744,6 @@ TEST_P(InvTrans16x16DCT, CompareReference) { CompareInvReference(ref_txfm_, thresh_); } -class PartialTrans16x16Test : public ::testing::TestWithParam< - std::tr1::tuple<FdctFunc, vpx_bit_depth_t> > { - public: - virtual ~PartialTrans16x16Test() {} - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FdctFunc fwd_txfm_; -}; - -TEST_P(PartialTrans16x16Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((maxval * kNumCoeffs) >> 1, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ((minval * kNumCoeffs) >> 1, output[0]); -} - -TEST_P(PartialTrans16x16Test, Random) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? 
-rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 16)); - EXPECT_EQ(sum >> 1, output[0]); -} - using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -836,11 +776,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct16x16_1_c, VPX_BITS_12))); #else INSTANTIATE_TEST_CASE_P( C, Trans16x16HT, @@ -849,17 +784,14 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_c, - VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans16x16DCT, - ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon, - 0, VPX_BITS_8))); -#endif + ::testing::Values(make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 0, VPX_BITS_8))); +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( @@ -876,9 +808,6 @@ INSTANTIATE_TEST_CASE_P( 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -913,9 +842,6 @@ INSTANTIATE_TEST_CASE_P( &idct16x16_10_add_12_sse2, 3167, VPX_BITS_12), make_tuple(&idct16x16_12, &idct16x16_256_add_12_sse2, 3167, VPX_BITS_12))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -931,8 +857,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht16x16_msa, &vp9_iht16x16_256_add_msa, 3, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans16x16Test, - ::testing::Values(make_tuple(&vpx_fdct16x16_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, Trans16x16DCT, + ::testing::Values(make_tuple(&vpx_fdct16x16_c, + &vpx_idct16x16_256_add_vsx, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libvpx/test/dct32x32_test.cc b/libvpx/test/dct32x32_test.cc index d8054c4eb..a95ff9732 100644 --- a/libvpx/test/dct32x32_test.cc +++ b/libvpx/test/dct32x32_test.cc @@ -292,67 +292,6 @@ TEST_P(Trans32x32Test, InverseAccuracy) { } } -class PartialTrans32x32Test - : public ::testing::TestWithParam< 
- std::tr1::tuple<FwdTxfmFunc, vpx_bit_depth_t> > { - public: - virtual ~PartialTrans32x32Test() {} - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - bit_depth_ = GET_PARAM(1); - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - vpx_bit_depth_t bit_depth_; - FwdTxfmFunc fwd_txfm_; -}; - -TEST_P(PartialTrans32x32Test, Extremes) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - const int minval = -maxval; - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = maxval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((maxval * kNumCoeffs) >> 3, output[0]); - - for (int i = 0; i < kNumCoeffs; ++i) input[i] = minval; - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ((minval * kNumCoeffs) >> 3, output[0]); -} - -TEST_P(PartialTrans32x32Test, Random) { -#if CONFIG_VP9_HIGHBITDEPTH - const int16_t maxval = - static_cast<int16_t>(clip_pixel_highbd(1 << 30, bit_depth_)); -#else - const int16_t maxval = 255; -#endif - DECLARE_ALIGNED(16, int16_t, input[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output[kNumCoeffs]); - ACMRandom rnd(ACMRandom::DeterministicSeed()); - - int sum = 0; - for (int i = 0; i < kNumCoeffs; ++i) { - const int val = (i & 1) ? -rnd(maxval + 1) : rnd(maxval + 1); - input[i] = val; - sum += val; - } - output[0] = 0; - ASM_REGISTER_STATE_CHECK(fwd_txfm_(input, output, 32)); - EXPECT_EQ(sum >> 3, output[0]); -} - using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH @@ -366,11 +305,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_8), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct32x32_1_c, VPX_BITS_12))); #else INSTANTIATE_TEST_CASE_P( C, Trans32x32Test, @@ -378,19 +312,16 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(C, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, - VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_neon, - 0, VPX_BITS_8), - make_tuple(&vpx_fdct32x32_rd_c, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_neon, &vpx_idct32x32_1024_add_neon, 1, VPX_BITS_8))); -#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( @@ -399,9 +330,6 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && 
!CONFIG_EMULATE_HARDWARE #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -418,9 +346,6 @@ INSTANTIATE_TEST_CASE_P( VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_sse2, &vpx_idct32x32_1024_add_c, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(SSE2, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, - VPX_BITS_8))); #endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE #if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE @@ -439,8 +364,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_idct32x32_1024_add_msa, 0, VPX_BITS_8), make_tuple(&vpx_fdct32x32_rd_msa, &vpx_idct32x32_1024_add_msa, 1, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P(MSA, PartialTrans32x32Test, - ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, - VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + VSX, Trans32x32Test, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_vsx, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct32x32_rd_c, + &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libvpx/test/dct_partial_test.cc b/libvpx/test/dct_partial_test.cc new file mode 100644 index 000000000..4d145f589 --- /dev/null +++ b/libvpx/test/dct_partial_test.cc @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include <limits> + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::tr1::tuple; +using std::tr1::make_tuple; + +namespace { +typedef void (*PartialFdctFunc)(const int16_t *in, tran_low_t *out, int stride); + +typedef tuple<PartialFdctFunc, int /* size */, vpx_bit_depth_t> + PartialFdctParam; + +tran_low_t partial_fdct_ref(const Buffer<int16_t> &in, int size) { + int64_t sum = 0; + for (int y = 0; y < size; ++y) { + for (int x = 0; x < size; ++x) { + sum += in.TopLeftPixel()[y * in.stride() + x]; + } + } + + switch (size) { + case 4: sum *= 2; break; + case 8: /*sum = sum;*/ break; + case 16: sum >>= 1; break; + case 32: sum >>= 3; break; + } + + return static_cast<tran_low_t>(sum); +} + +class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> { + public: + PartialFdctTest() { + fwd_txfm_ = GET_PARAM(0); + size_ = GET_PARAM(1); + bit_depth_ = GET_PARAM(2); + } + + virtual void TearDown() { libvpx_test::ClearSystemState(); } + + protected: + void RunTest() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int16_t maxvalue = + clip_pixel_highbd(std::numeric_limits<int16_t>::max(), bit_depth_); + const int16_t minvalue = -maxvalue; + Buffer<int16_t> input_block = + Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < 100; ++i) { + if (i == 0) { + input_block.Set(maxvalue); + } else if (i == 1) { + input_block.Set(minvalue); + } else { + input_block.Set(&rnd, minvalue, maxvalue); + } + + ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block.TopLeftPixel(), + output_block.TopLeftPixel(), + input_block.stride())); + + EXPECT_EQ(partial_fdct_ref(input_block, size_), + output_block.TopLeftPixel()[0]); + } + } + + PartialFdctFunc fwd_txfm_; + vpx_bit_depth_t bit_depth_; + int size_; +}; + +TEST_P(PartialFdctTest, PartialFdctTest) { RunTest(); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct32x32_1_c, 32, VPX_BITS_10), + make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_1_c, 16, VPX_BITS_10), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct8x8_1_c, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_c, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_c, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_c, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_c, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_sse2, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_sse2, 16, VPX_BITS_8), + 
make_tuple(&vpx_fdct8x8_1_sse2, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_sse2, 4, VPX_BITS_8))); +#endif // HAVE_SSE2 + +#if HAVE_NEON +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + NEON, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_NEON + +#if HAVE_MSA +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct8x8_1_msa, 8, + VPX_BITS_8))); +#else // !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + MSA, PartialFdctTest, + ::testing::Values(make_tuple(&vpx_fdct32x32_1_msa, 32, VPX_BITS_8), + make_tuple(&vpx_fdct16x16_1_msa, 16, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_1_msa, 8, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA +} // namespace diff --git a/libvpx/test/dct_test.cc b/libvpx/test/dct_test.cc new file mode 100644 index 000000000..addbdfb46 --- /dev/null +++ b/libvpx/test/dct_test.cc @@ -0,0 +1,737 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "test/acm_random.h" +#include "test/buffer.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/util.h" +#include "vp9/common/vp9_entropy.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +using libvpx_test::ACMRandom; +using libvpx_test::Buffer; +using std::tr1::tuple; +using std::tr1::make_tuple; + +namespace { +typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); +typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); +typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, + int tx_type); +typedef void (*FhtFuncRef)(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, + int size, int tx_type); +typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, + int tx_type); + +/* forward transform, inverse transform, size, transform type, bit depth */ +typedef tuple<FdctFunc, IdctFunc, int, int, vpx_bit_depth_t> DctParam; +typedef tuple<FhtFunc, IhtFunc, int, int, vpx_bit_depth_t> HtParam; + +void fdct_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size, + int /*tx_type*/) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vpx_fdct4x4_c(i, o, i_stride); + } else if (size == 8) { + vpx_fdct8x8_c(i, o, i_stride); + } else if (size == 16) { + vpx_fdct16x16_c(i, o, i_stride); + } else if (size == 32) { + vpx_fdct32x32_c(i, o, i_stride); + } +} + +void fht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size, + int tx_type) { + const int16_t *i = in.TopLeftPixel(); + const int i_stride = in.stride(); + tran_low_t *o = out->TopLeftPixel(); + if (size == 4) { + vp9_fht4x4_c(i, o, i_stride, tx_type); + } else if (size == 8) { + vp9_fht8x8_c(i, o, i_stride, tx_type); + } else if (size == 16) { + vp9_fht16x16_c(i, o, i_stride, tx_type); + } +} + +void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size, + int /*tx_type*/) { + ASSERT_EQ(size, 4); + vp9_fwht4x4_c(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); +} + +#if CONFIG_VP9_HIGHBITDEPTH +#define idctNxN(n, coeffs, bitdepth) \ + void idct##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ + int stride) { \ + vpx_highbd_idct##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ + stride, bitdepth); \ + } + +idctNxN(4, 16, 10); +idctNxN(4, 16, 12); +idctNxN(8, 64, 10); +idctNxN(8, 64, 12); +idctNxN(16, 256, 10); +idctNxN(16, 256, 12); +idctNxN(32, 1024, 10); +idctNxN(32, 1024, 12); + +#define ihtNxN(n, coeffs, bitdepth) \ + void iht##n##x##n##_##bitdepth(const tran_low_t *in, uint8_t *out, \ + int stride, int tx_type) { \ + vp9_highbd_iht##n##x##n##_##coeffs##_add_c(in, CAST_TO_SHORTPTR(out), \ + stride, tx_type, bitdepth); \ + } + +ihtNxN(4, 16, 10); +ihtNxN(4, 16, 12); +ihtNxN(8, 64, 10); +ihtNxN(8, 64, 12); +ihtNxN(16, 256, 10); +// ihtNxN(16, 256, 12); + +void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); +} + +void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { + vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +class TransTestBase { + public: + virtual void TearDown() { 
libvpx_test::ClearSystemState(); } + + protected: + virtual void RunFwdTxfm(const Buffer<int16_t> &in, + Buffer<tran_low_t> *out) = 0; + + virtual void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) = 0; + + void RunAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + Buffer<int16_t> test_input_block = + Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(test_input_block.Init()); + Buffer<tran_low_t> test_temp_block = + Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(test_temp_block.Init()); + Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16); + ASSERT_TRUE(dst.Init()); + Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0, 16); + ASSERT_TRUE(src.Init()); +#if CONFIG_VP9_HIGHBITDEPTH + Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16); + ASSERT_TRUE(dst16.Init()); + Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0, 16); + ASSERT_TRUE(src16.Init()); +#endif // CONFIG_VP9_HIGHBITDEPTH + uint32_t max_error = 0; + int64_t total_error = 0; + const int count_test_block = 10000; + for (int i = 0; i < count_test_block; ++i) { + if (bit_depth_ == 8) { + src.Set(&rnd, &ACMRandom::Rand8); + dst.Set(&rnd, &ACMRandom::Rand8); + // Initialize a test block with input range [-255, 255]. + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src.TopLeftPixel()[h * src.stride() + w] - + dst.TopLeftPixel()[h * dst.stride() + w]; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16.Set(&rnd, 0, max_pixel_value_); + dst16.Set(&rnd, 0, max_pixel_value_); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + test_input_block.TopLeftPixel()[h * test_input_block.stride() + w] = + src16.TopLeftPixel()[h * src16.stride() + w] - + dst16.TopLeftPixel()[h * dst16.stride() + w]; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block, &test_temp_block)); + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, dst.TopLeftPixel())); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth_ != 8) { + diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - + src16.TopLeftPixel()[h * src16.stride() + w]; + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + diff = dst.TopLeftPixel()[h * dst.stride() + w] - + src.TopLeftPixel()[h * src.stride() + w]; +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + if (max_error < error) max_error = error; + total_error += error; + } + } + } + + EXPECT_GE(static_cast<uint32_t>(limit), max_error) + << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; + + EXPECT_GE(count_test_block * limit, total_error) + << "Error: 4x4 FHT/IHT has average round trip error > " << limit + << " per block"; + } + + void RunCoeffCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer<int16_t> input_block = + Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 
0 : 16); + ASSERT_TRUE(input_block.Init()); + Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. + input_block.Set(&rnd, -max_pixel_value_, max_pixel_value_); + + fwd_txfm_ref(input_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.PrintDifference(output_ref_block); + return; + } + } + } + + void RunMemCheck() { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 5000; + Buffer<int16_t> input_extreme_block = + Buffer<int16_t>(size_, size_, 8, size_ == 4 ? 0 : 16); + ASSERT_TRUE(input_extreme_block.Init()); + Buffer<tran_low_t> output_ref_block = Buffer<tran_low_t>(size_, size_, 0); + ASSERT_TRUE(output_ref_block.Init()); + Buffer<tran_low_t> output_block = Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(output_block.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with -max_pixel_value_ or max_pixel_value_. + if (i == 0) { + input_extreme_block.Set(max_pixel_value_); + } else if (i == 1) { + input_extreme_block.Set(-max_pixel_value_); + } else { + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + input_extreme_block + .TopLeftPixel()[h * input_extreme_block.stride() + w] = + rnd.Rand8() % 2 ? max_pixel_value_ : -max_pixel_value_; + } + } + } + + fwd_txfm_ref(input_extreme_block, &output_ref_block, size_, tx_type_); + ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block, &output_block)); + + // The minimum quant value is 4. + EXPECT_TRUE(output_block.CheckValues(output_ref_block)); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + EXPECT_GE( + 4 * DCT_MAX_VALUE << (bit_depth_ - 8), + abs(output_block.TopLeftPixel()[h * output_block.stride() + w])) + << "Error: 4x4 FDCT has coefficient larger than " + "4*DCT_MAX_VALUE" + << " at " << w << "," << h; + if (::testing::Test::HasFailure()) { + printf("Size: %d Transform type: %d\n", size_, tx_type_); + output_block.DumpBuffer(); + return; + } + } + } + } + } + + void RunInvAccuracyCheck(int limit) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + const int count_test_block = 1000; + Buffer<int16_t> in = Buffer<int16_t>(size_, size_, 4); + ASSERT_TRUE(in.Init()); + Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16); + ASSERT_TRUE(dst.Init()); + Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0); + ASSERT_TRUE(src.Init()); + Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16); + ASSERT_TRUE(dst16.Init()); + Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0); + ASSERT_TRUE(src16.Init()); + + for (int i = 0; i < count_test_block; ++i) { + // Initialize a test block with input range [-max_pixel_value_, + // max_pixel_value_]. 
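Here, as in RunAccuracyCheck() above, the input block is a residual (in = src - dst), so the inverse transform, which adds its output back onto the destination buffer, should reconstruct the source to within the given limit. A minimal standalone sketch of that round trip for the 8-bit 4x4 case; RoundTrip4x4Sketch is a hypothetical helper, not part of this patch, but the two C functions are the same ones fdct_ref() and the C instantiations below use:

    #include <assert.h>
    #include "./vpx_dsp_rtcd.h"

    // Round-trip one packed 4x4 block: fdct(src - pred), then idct-add onto a
    // copy of the prediction, and bound the per-pixel reconstruction error.
    void RoundTrip4x4Sketch(const uint8_t src[16], const uint8_t pred[16]) {
      int16_t residual[16];
      tran_low_t coeff[16];
      uint8_t recon[16];
      for (int j = 0; j < 16; ++j) {
        residual[j] = src[j] - pred[j];
        recon[j] = pred[j];
      }
      vpx_fdct4x4_c(residual, coeff, 4);      // stride of a packed 4x4 block
      vpx_idct4x4_16_add_c(coeff, recon, 4);  // adds the inverse onto recon
      for (int j = 0; j < 16; ++j) {
        const int diff = recon[j] - src[j];
        assert(diff * diff <= 1);  // limit == 1, as in RunInvAccuracyCheck(1)
      }
    }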
+ if (bit_depth_ == VPX_BITS_8) { + src.Set(&rnd, &ACMRandom::Rand8); + dst.Set(&rnd, &ACMRandom::Rand8); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + in.TopLeftPixel()[h * in.stride() + w] = + src.TopLeftPixel()[h * src.stride() + w] - + dst.TopLeftPixel()[h * dst.stride() + w]; + } + } +#if CONFIG_VP9_HIGHBITDEPTH + } else { + src16.Set(&rnd, 0, max_pixel_value_); + dst16.Set(&rnd, 0, max_pixel_value_); + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + in.TopLeftPixel()[h * in.stride() + w] = + src16.TopLeftPixel()[h * src16.stride() + w] - + dst16.TopLeftPixel()[h * dst16.stride() + w]; + } + } +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + fwd_txfm_ref(in, &coeff, size_, tx_type_); + + if (bit_depth_ == VPX_BITS_8) { + ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst.TopLeftPixel())); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + ASM_REGISTER_STATE_CHECK( + RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16.TopLeftPixel()))); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + + for (int h = 0; h < size_; ++h) { + for (int w = 0; w < size_; ++w) { + int diff; +#if CONFIG_VP9_HIGHBITDEPTH + if (bit_depth_ != 8) { + diff = dst16.TopLeftPixel()[h * dst16.stride() + w] - + src16.TopLeftPixel()[h * src16.stride() + w]; + } else { +#endif // CONFIG_VP9_HIGHBITDEPTH + diff = dst.TopLeftPixel()[h * dst.stride() + w] - + src.TopLeftPixel()[h * src.stride() + w]; +#if CONFIG_VP9_HIGHBITDEPTH + } +#endif // CONFIG_VP9_HIGHBITDEPTH + const uint32_t error = diff * diff; + EXPECT_GE(static_cast<uint32_t>(limit), error) + << "Error: " << size_ << "x" << size_ << " IDCT has error " + << error << " at " << w << "," << h; + } + } + } + } + + FhtFuncRef fwd_txfm_ref; + vpx_bit_depth_t bit_depth_; + int tx_type_; + int max_pixel_value_; + int size_; +}; + +class TransDCT : public TransTestBase, + public ::testing::TestWithParam<DctParam> { + public: + TransDCT() { + fwd_txfm_ref = fdct_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); + } + + void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride()); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(TransDCT, AccuracyCheck) { RunAccuracyCheck(1); } + +TEST_P(TransDCT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransDCT, MemCheck) { RunMemCheck(); } + +TEST_P(TransDCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_10, 32, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct32x32_c, &idct32x32_12, 32, 0, VPX_BITS_12), + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_10, 16, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct16x16_c, &idct16x16_12, 16, 0, VPX_BITS_12), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_c, &idct8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 4, 0,
VPX_BITS_12), + make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, TransDCT, + ::testing::Values( + make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_c, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_c, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 4, 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +#if !CONFIG_EMULATE_HARDWARE +#if CONFIG_VP9_HIGHBITDEPTH +/* TODO:(johannkoenig) Determine why these fail AccuracyCheck + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_12, 32, 0, VPX_BITS_12), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_12, 16, 0, VPX_BITS_12), +*/ +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Values( + make_tuple(&vpx_highbd_fdct32x32_sse2, &idct32x32_10, 32, 0, + VPX_BITS_10), + make_tuple(&vpx_fdct32x32_sse2, &vpx_idct32x32_1024_add_sse2, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct16x16_sse2, &idct16x16_10, 16, 0, + VPX_BITS_10), + make_tuple(&vpx_fdct16x16_sse2, &vpx_idct16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_10, 8, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct8x8_sse2, &idct8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, 0, + VPX_BITS_8), + make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, 0, + VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + SSE2, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_sse2, + &vpx_idct32x32_1024_add_sse2, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_sse2, + &vpx_idct16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_sse2, &vpx_idct8x8_64_add_sse2, 8, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_sse2, 4, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSE2 + +#if !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#if !ARCH_X86_64 +// TODO(johannkoenig): high bit depth fdct8x8. +INSTANTIATE_TEST_CASE_P( + SSSE3, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_c, &vpx_idct8x8_64_add_sse2, 8, 0, + VPX_BITS_8))); +#else +// vpx_fdct8x8_ssse3 is only available in 64 bit builds. +INSTANTIATE_TEST_CASE_P( + SSSE3, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_c, &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8), + make_tuple(&vpx_fdct8x8_ssse3, &vpx_idct8x8_64_add_sse2, + 8, 0, VPX_BITS_8))); +#endif // !ARCH_X86_64 +#endif // HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH + +#if !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE +// TODO(johannkoenig): high bit depth fdct32x32. 
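Each entry in these lists is one DctParam tuple; reading it against the typedef near the top of the file gives the positional meaning. An annotated copy of the AVX2 entry that follows (commentary only, not new test code); note the AVX2 forward transform is checked against the SSE2 inverse, since no AVX2 inverse is wired up here:

    // make_tuple(&vpx_fdct32x32_avx2,           <- FdctFunc under test
    //            &vpx_idct32x32_1024_add_sse2,  <- IdctFunc used to reconstruct
    //            32,                            <- transform size (32x32)
    //            0,                             <- tx_type (ignored by fdct_ref)
    //            VPX_BITS_8)                    <- bit depth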
+INSTANTIATE_TEST_CASE_P( + AVX2, TransDCT, ::testing::Values(make_tuple(&vpx_fdct32x32_avx2, + &vpx_idct32x32_1024_add_sse2, + 32, 0, VPX_BITS_8))); + +#endif // !CONFIG_VP9_HIGHBITDEPTH && HAVE_AVX2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON +#if !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + NEON, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct32x32_neon, + &vpx_idct32x32_1024_add_neon, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_neon, + &vpx_idct16x16_256_add_neon, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_neon, &vpx_idct8x8_64_add_neon, 8, + 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_neon, &vpx_idct4x4_16_add_neon, 4, + 0, VPX_BITS_8))); +#endif // !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON + +#if HAVE_MSA +#if !CONFIG_VP9_HIGHBITDEPTH +#if !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P( + MSA, TransDCT, + ::testing::Values( + make_tuple(&vpx_fdct32x32_msa, &vpx_idct32x32_1024_add_msa, 32, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct16x16_msa, &vpx_idct16x16_256_add_msa, 16, 0, + VPX_BITS_8), + make_tuple(&vpx_fdct8x8_msa, &vpx_idct8x8_64_add_msa, 8, 0, VPX_BITS_8), + make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 4, 0, + VPX_BITS_8))); +#endif // !CONFIG_EMULATE_HARDWARE +#endif // !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, TransDCT, + ::testing::Values(make_tuple(&vpx_fdct4x4_c, + &vpx_idct4x4_16_add_vsx, 4, + 0, VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +class TransHT : public TransTestBase, public ::testing::TestWithParam<HtParam> { + public: + TransHT() { + fwd_txfm_ref = fht_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride(), tx_type_); + } + + void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride(), tx_type_); + } + + FhtFunc fwd_txfm_; + IhtFunc inv_txfm_; +}; + +TEST_P(TransHT, AccuracyCheck) { RunAccuracyCheck(1); } + +TEST_P(TransHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransHT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } + +/* TODO:(johannkoenig) Determine why these fail AccuracyCheck + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_12, 16, 3, VPX_BITS_12), + */ +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransHT, + ::testing::Values( + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht16x16_c, &iht16x16_10, 16, 3, VPX_BITS_10), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 0, 
VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_10, 8, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht8x8_c, &iht8x8_12, 8, 3, VPX_BITS_12), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 1, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 2, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 4, 3, VPX_BITS_10), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 1, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 2, VPX_BITS_12), + make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 4, 3, VPX_BITS_12), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P( + C, TransHT, + ::testing::Values( + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 0, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 1, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 2, VPX_BITS_8), + make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 16, 3, VPX_BITS_8), + + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 8, 3, VPX_BITS_8), + + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 4, 3, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P( + SSE2, TransHT, + ::testing::Values( + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 0, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 1, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 2, + VPX_BITS_8), + make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 16, 3, + VPX_BITS_8), + + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 0, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 1, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 2, VPX_BITS_8), + make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 8, 3, VPX_BITS_8), + + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 0, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 1, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 2, VPX_BITS_8), + make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 4, 3, + VPX_BITS_8))); 
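Throughout the TransHT instantiations above, the fourth tuple field is the tx_type forwarded to the vp9_fht/vp9_iht functions; it selects which direction uses a DCT and which an ADST, which is why every forward/inverse pairing appears four times. The values follow VP9's TX_TYPE enum (as defined in vp9/common/vp9_enums.h):

    typedef enum {
      DCT_DCT = 0,    // DCT in both horizontal and vertical
      ADST_DCT = 1,   // ADST in vertical, DCT in horizontal
      DCT_ADST = 2,   // DCT in vertical, ADST in horizontal
      ADST_ADST = 3   // ADST in both directions
    } TX_TYPE;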
+#endif // HAVE_SSE2 + +class TransWHT : public TransTestBase, + public ::testing::TestWithParam<DctParam> { + public: + TransWHT() { + fwd_txfm_ref = fwht_ref; + fwd_txfm_ = GET_PARAM(0); + inv_txfm_ = GET_PARAM(1); + size_ = GET_PARAM(2); + tx_type_ = GET_PARAM(3); + bit_depth_ = GET_PARAM(4); + max_pixel_value_ = (1 << bit_depth_) - 1; + } + + protected: + void RunFwdTxfm(const Buffer<int16_t> &in, Buffer<tran_low_t> *out) { + fwd_txfm_(in.TopLeftPixel(), out->TopLeftPixel(), in.stride()); + } + + void RunInvTxfm(const Buffer<tran_low_t> &in, uint8_t *out) { + inv_txfm_(in.TopLeftPixel(), out, in.stride()); + } + + FdctFunc fwd_txfm_; + IdctFunc inv_txfm_; +}; + +TEST_P(TransWHT, AccuracyCheck) { RunAccuracyCheck(0); } + +TEST_P(TransWHT, CoeffCheck) { RunCoeffCheck(); } + +TEST_P(TransWHT, MemCheck) { RunMemCheck(); } + +TEST_P(TransWHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } + +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + C, TransWHT, + ::testing::Values( + make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 4, 0, VPX_BITS_10), + make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 4, 0, VPX_BITS_12), + make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 4, 0, VPX_BITS_8))); +#else +INSTANTIATE_TEST_CASE_P(C, TransWHT, + ::testing::Values(make_tuple(&vp9_fwht4x4_c, + &vpx_iwht4x4_16_add_c, 4, + 0, VPX_BITS_8))); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, + ::testing::Values(make_tuple(&vp9_fwht4x4_sse2, + &vpx_iwht4x4_16_add_sse2, + 4, 0, VPX_BITS_8))); +#endif // HAVE_SSE2 +} // namespace diff --git a/libvpx/test/decode_test_driver.cc b/libvpx/test/decode_test_driver.cc index b738e0db1..48680eb8e 100644 --- a/libvpx/test/decode_test_driver.cc +++ b/libvpx/test/decode_test_driver.cc @@ -53,13 +53,13 @@ void DecoderTest::HandlePeekResult(Decoder *const decoder, * pass it is not a keyframe, so we only expect VPX_CODEC_OK on the first * frame, which must be a keyframe. */ if (video->frame_number() == 0) - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } else { /* The Vp9 implementation of PeekStream returns an error only if the * data passed to it isn't a valid Vp9 chunk. 
*/ - ASSERT_EQ(VPX_CODEC_OK, res_peek) << "Peek return failed: " - << vpx_codec_err_to_string(res_peek); + ASSERT_EQ(VPX_CODEC_OK, res_peek) + << "Peek return failed: " << vpx_codec_err_to_string(res_peek); } } diff --git a/libvpx/test/encode_api_test.cc b/libvpx/test/encode_api_test.cc index f685493aa..87e29b61d 100644 --- a/libvpx/test/encode_api_test.cc +++ b/libvpx/test/encode_api_test.cc @@ -79,4 +79,117 @@ TEST(EncodeAPI, HighBitDepthCapability) { #endif } +#if CONFIG_VP8_ENCODER +TEST(EncodeAPI, ImageSizeSetting) { + const int width = 711; + const int height = 360; + const int bps = 12; + vpx_image_t img; + vpx_codec_ctx_t enc; + vpx_codec_enc_cfg_t cfg; + uint8_t *img_buf = reinterpret_cast<uint8_t *>( + calloc(width * height * bps / 8, sizeof(*img_buf))); + vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &cfg, 0); + + cfg.g_w = width; + cfg.g_h = height; + + vpx_img_wrap(&img, VPX_IMG_FMT_I420, width, height, 1, img_buf); + + vpx_codec_enc_init(&enc, vpx_codec_vp8_cx(), &cfg, 0); + + EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, &img, 0, 1, 0, 0)); + + free(img_buf); + + vpx_codec_destroy(&enc); +} +#endif + +// Set up 2 spatial streams with 2 temporal layers per stream, and generate +// invalid configuration by setting the temporal layer rate allocation +// (ts_target_bitrate[]) to 0 for both layers. This should fail independent of +// CONFIG_MULTI_RES_ENCODING. +TEST(EncodeAPI, MultiResEncode) { + static const vpx_codec_iface_t *kCodecs[] = { +#if CONFIG_VP8_ENCODER + &vpx_codec_vp8_cx_algo, +#endif +#if CONFIG_VP9_ENCODER + &vpx_codec_vp9_cx_algo, +#endif + }; + const int width = 1280; + const int height = 720; + const int width_down = width / 2; + const int height_down = height / 2; + const int target_bitrate = 1000; + const int framerate = 30; + + for (int c = 0; c < NELEMENTS(kCodecs); ++c) { + const vpx_codec_iface_t *const iface = kCodecs[c]; + vpx_codec_ctx_t enc[2]; + vpx_codec_enc_cfg_t cfg[2]; + vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } }; + + memset(enc, 0, sizeof(enc)); + + for (int i = 0; i < 2; i++) { + vpx_codec_enc_config_default(iface, &cfg[i], 0); + } + + /* Highest-resolution encoder settings */ + cfg[0].g_w = width; + cfg[0].g_h = height; + cfg[0].rc_dropframe_thresh = 0; + cfg[0].rc_end_usage = VPX_CBR; + cfg[0].rc_resize_allowed = 0; + cfg[0].rc_min_quantizer = 2; + cfg[0].rc_max_quantizer = 56; + cfg[0].rc_undershoot_pct = 100; + cfg[0].rc_overshoot_pct = 15; + cfg[0].rc_buf_initial_sz = 500; + cfg[0].rc_buf_optimal_sz = 600; + cfg[0].rc_buf_sz = 1000; + cfg[0].g_error_resilient = 1; /* Enable error resilient mode */ + cfg[0].g_lag_in_frames = 0; + + cfg[0].kf_mode = VPX_KF_AUTO; + cfg[0].kf_min_dist = 3000; + cfg[0].kf_max_dist = 3000; + + cfg[0].rc_target_bitrate = target_bitrate; /* Set target bitrate */ + cfg[0].g_timebase.num = 1; /* Set fps */ + cfg[0].g_timebase.den = framerate; + + memcpy(&cfg[1], &cfg[0], sizeof(cfg[0])); + cfg[1].rc_target_bitrate = 500; + cfg[1].g_w = width_down; + cfg[1].g_h = height_down; + + for (int i = 0; i < 2; i++) { + cfg[i].ts_number_layers = 2; + cfg[i].ts_periodicity = 2; + cfg[i].ts_rate_decimator[0] = 2; + cfg[i].ts_rate_decimator[1] = 1; + cfg[i].ts_layer_id[0] = 0; + cfg[i].ts_layer_id[1] = 1; + // Invalid parameters. + cfg[i].ts_target_bitrate[0] = 0; + cfg[i].ts_target_bitrate[1] = 0; + } + + // VP9 should report incapable, VP8 invalid for all configurations. 
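The expectation below distinguishes the two encoders by their interface-name prefix rather than by compile-time flags, so one loop body covers whichever codecs were built in. The same check pulled out as a sketch; IsVP9 is a hypothetical helper, not part of this patch:

    #include <string.h>
    #include "vpx/vpx_codec.h"

    // True when the interface reports itself as VP9; vpx_codec_iface_name()
    // returns a name such as "WebM Project VP9 Encoder".
    bool IsVP9(vpx_codec_iface_t *iface) {
      const char kVP9Name[] = "WebM Project VP9";
      return strncmp(kVP9Name, vpx_codec_iface_name(iface),
                     sizeof(kVP9Name) - 1) == 0;
    }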
+ const char kVP9Name[] = "WebM Project VP9"; + const bool is_vp9 = strncmp(kVP9Name, vpx_codec_iface_name(iface), + sizeof(kVP9Name) - 1) == 0; + EXPECT_EQ(is_vp9 ? VPX_CODEC_INCAPABLE : VPX_CODEC_INVALID_PARAM, + vpx_codec_enc_init_multi(&enc[0], iface, &cfg[0], 2, 0, &dsf[0])); + + for (int i = 0; i < 2; i++) { + vpx_codec_destroy(&enc[i]); + } + } +} + } // namespace diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc index 5d2b4008a..b2cbc3f05 100644 --- a/libvpx/test/encode_test_driver.cc +++ b/libvpx/test/encode_test_driver.cc @@ -201,6 +201,8 @@ void EncoderTest::RunLoop(VideoSource *video) { PreEncodeFrameHook(video, encoder.get()); encoder->EncodeFrame(video, frame_flags_); + PostEncodeFrameHook(encoder.get()); + CxDataIterator iter = encoder->GetCxData(); bool has_cxdata = false; diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h index 08a57ad77..89a3b1767 100644 --- a/libvpx/test/encode_test_driver.h +++ b/libvpx/test/encode_test_driver.h @@ -139,6 +139,13 @@ class Encoder { } #endif +#if CONFIG_VP8_ENCODER + void Control(int ctrl_id, vpx_roi_map_t *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } +#endif + void Config(const vpx_codec_enc_cfg_t *cfg) { const vpx_codec_err_t res = vpx_codec_enc_config_set(&encoder_, cfg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); @@ -212,6 +219,8 @@ class EncoderTest { virtual void PreEncodeFrameHook(VideoSource * /*video*/, Encoder * /*encoder*/) {} + virtual void PostEncodeFrameHook(Encoder * /*encoder*/) {} + // Hook to be called on every compressed data packet. virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {} diff --git a/libvpx/test/external_frame_buffer_test.cc b/libvpx/test/external_frame_buffer_test.cc index f9686695a..dbf297119 100644 --- a/libvpx/test/external_frame_buffer_test.cc +++ b/libvpx/test/external_frame_buffer_test.cc @@ -34,7 +34,8 @@ struct ExternalFrameBuffer { // Class to manipulate a list of external frame buffers. class ExternalFrameBufferList { public: - ExternalFrameBufferList() : num_buffers_(0), ext_fb_list_(NULL) {} + ExternalFrameBufferList() + : num_buffers_(0), num_used_buffers_(0), ext_fb_list_(NULL) {} virtual ~ExternalFrameBufferList() { for (int i = 0; i < num_buffers_; ++i) { @@ -71,6 +72,8 @@ class ExternalFrameBufferList { } SetFrameBuffer(idx, fb); + + num_used_buffers_++; return 0; } @@ -106,6 +109,7 @@ class ExternalFrameBufferList { } EXPECT_EQ(1, ext_fb->in_use); ext_fb->in_use = 0; + num_used_buffers_--; return 0; } @@ -121,6 +125,8 @@ class ExternalFrameBufferList { } } + int num_used_buffers() const { return num_used_buffers_; } + private: // Returns the index of the first free frame buffer. Returns |num_buffers_| // if there are no free frame buffers. @@ -145,6 +151,7 @@ class ExternalFrameBufferList { } int num_buffers_; + int num_used_buffers_; ExternalFrameBuffer *ext_fb_list_; }; @@ -220,8 +227,8 @@ class ExternalFrameBufferMD5Test void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "Md5 file open failed. 
Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, @@ -273,6 +280,7 @@ class ExternalFrameBufferMD5Test #if CONFIG_WEBM_IO const char kVP9TestFile[] = "vp90-2-02-size-lf-1920x1080.webm"; +const char kVP9NonRefTestFile[] = "vp90-2-22-svc_1280x720_1.webm"; // Class for testing passing in external frame buffers to libvpx. class ExternalFrameBufferTest : public ::testing::Test { @@ -292,7 +300,9 @@ class ExternalFrameBufferTest : public ::testing::Test { virtual void TearDown() { delete decoder_; + decoder_ = NULL; delete video_; + video_ = NULL; } // Passes the external frame buffer information to libvpx. @@ -325,7 +335,7 @@ class ExternalFrameBufferTest : public ::testing::Test { return VPX_CODEC_OK; } - private: + protected: void CheckDecodedFrames() { libvpx_test::DxDataIterator dec_iter = decoder_->GetDxData(); const vpx_image_t *img = NULL; @@ -341,6 +351,25 @@ class ExternalFrameBufferTest : public ::testing::Test { int num_buffers_; ExternalFrameBufferList fb_list_; }; + +class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { + protected: + virtual void SetUp() { + video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); + ASSERT_TRUE(video_ != NULL); + video_->Init(); + video_->Begin(); + + vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); + decoder_ = new libvpx_test::VP9Decoder(cfg, 0); + ASSERT_TRUE(decoder_ != NULL); + } + + virtual void CheckFrameBufferRelease() { + TearDown(); + ASSERT_EQ(0, fb_list_.num_used_buffers()); + } +}; #endif // CONFIG_WEBM_IO // This test runs through the set of test vectors, and decodes them. @@ -419,6 +448,8 @@ TEST_F(ExternalFrameBufferTest, NotEnoughBuffers) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); ASSERT_EQ(VPX_CODEC_OK, DecodeOneFrame()); + // Only run this on long clips. Decoding a very short clip will return + // VPX_CODEC_OK even with only 2 buffers. ASSERT_EQ(VPX_CODEC_MEM_ERROR, DecodeRemainingFrames()); } @@ -467,6 +498,15 @@ TEST_F(ExternalFrameBufferTest, SetAfterDecode) { SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, release_vp9_frame_buffer)); } + +TEST_F(ExternalFrameBufferNonRefTest, ReleaseNonRefFrameBuffer) { + const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS; + ASSERT_EQ(VPX_CODEC_OK, + SetFrameBufferFunctions(num_buffers, get_vp9_frame_buffer, + release_vp9_frame_buffer)); + ASSERT_EQ(VPX_CODEC_OK, DecodeRemainingFrames()); + CheckFrameBufferRelease(); +} #endif // CONFIG_WEBM_IO VP9_INSTANTIATE_TEST_CASE( diff --git a/libvpx/test/fdct4x4_test.cc b/libvpx/test/fdct4x4_test.cc deleted file mode 100644 index aa90bfa18..000000000 --- a/libvpx/test/fdct4x4_test.cc +++ /dev/null @@ -1,511 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include <math.h> -#include <stdlib.h> -#include <string.h> - -#include "third_party/googletest/src/include/gtest/gtest.h" - -#include "./vp9_rtcd.h" -#include "./vpx_dsp_rtcd.h" -#include "test/acm_random.h" -#include "test/clear_system_state.h" -#include "test/register_state_check.h" -#include "test/util.h" -#include "vp9/common/vp9_entropy.h" -#include "vpx/vpx_codec.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -using libvpx_test::ACMRandom; - -namespace { -const int kNumCoeffs = 16; -typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride); -typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride); -typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride, - int tx_type); -typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride, - int tx_type); - -typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param; -typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param; - -void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vpx_fdct4x4_c(in, out, stride); -} - -void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) { - vp9_fht4x4_c(in, out, stride, tx_type); -} - -void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride, - int /*tx_type*/) { - vp9_fwht4x4_c(in, out, stride); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); -} - -void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); -} - -void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 10); -} - -void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) { - vp9_highbd_iht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, tx_type, 12); -} - -void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 10); -} - -void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_iwht4x4_16_add_c(in, CAST_TO_SHORTPTR(out), stride, 12); -} - -#if HAVE_SSE2 -void idct4x4_10_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 10); -} - -void idct4x4_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { - vpx_highbd_idct4x4_16_add_sse2(in, CAST_TO_SHORTPTR(out), stride, 12); -} -#endif // HAVE_SSE2 -#endif // CONFIG_VP9_HIGHBITDEPTH - -class Trans4x4TestBase { - public: - virtual ~Trans4x4TestBase() {} - - protected: - virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0; - - virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0; - - void RunAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - uint32_t max_error = 0; - int64_t total_error = 0; - const int count_test_block = 10000; - for (int i = 0; i < count_test_block; ++i) { - DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - // Initialize a test block with 
input range [-255, 255]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - test_input_block[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - test_input_block[j] = src16[j] - dst16[j]; -#endif - } - } - - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(test_input_block, test_temp_block, pitch_)); - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(test_temp_block, CAST_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; -#else - ASSERT_EQ(VPX_BITS_8, bit_depth_); - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - if (max_error < error) max_error = error; - total_error += error; - } - } - - EXPECT_GE(static_cast<uint32_t>(limit), max_error) - << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit; - - EXPECT_GE(count_test_block * limit, total_error) - << "Error: 4x4 FHT/IHT has average round trip error > " << limit - << " per block"; - } - - void RunCoeffCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_); - } - - fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_)); - - // The minimum quant value is 4. - for (int j = 0; j < kNumCoeffs; ++j) - EXPECT_EQ(output_block[j], output_ref_block[j]); - } - } - - void RunMemCheck() { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 5000; - DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]); - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_; - } - if (i == 0) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_; - } else if (i == 1) { - for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_; - } - - fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_); - ASM_REGISTER_STATE_CHECK( - RunFwdTxfm(input_extreme_block, output_block, pitch_)); - - // The minimum quant value is 4. 
- for (int j = 0; j < kNumCoeffs; ++j) { - EXPECT_EQ(output_block[j], output_ref_block[j]); - EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j])) - << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE"; - } - } - } - - void RunInvAccuracyCheck(int limit) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - const int count_test_block = 1000; - DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]); - DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]); - DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]); -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]); - DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]); -#endif - - for (int i = 0; i < count_test_block; ++i) { - // Initialize a test block with input range [-mask_, mask_]. - for (int j = 0; j < kNumCoeffs; ++j) { - if (bit_depth_ == VPX_BITS_8) { - src[j] = rnd.Rand8(); - dst[j] = rnd.Rand8(); - in[j] = src[j] - dst[j]; -#if CONFIG_VP9_HIGHBITDEPTH - } else { - src16[j] = rnd.Rand16() & mask_; - dst16[j] = rnd.Rand16() & mask_; - in[j] = src16[j] - dst16[j]; -#endif - } - } - - fwd_txfm_ref(in, coeff, pitch_, tx_type_); - - if (bit_depth_ == VPX_BITS_8) { - ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_)); -#if CONFIG_VP9_HIGHBITDEPTH - } else { - ASM_REGISTER_STATE_CHECK( - RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_)); -#endif - } - - for (int j = 0; j < kNumCoeffs; ++j) { -#if CONFIG_VP9_HIGHBITDEPTH - const int diff = - bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j]; -#else - const int diff = dst[j] - src[j]; -#endif - const uint32_t error = diff * diff; - EXPECT_GE(static_cast<uint32_t>(limit), error) - << "Error: 4x4 IDCT has error " << error << " at index " << j; - } - } - } - - int pitch_; - int tx_type_; - FhtFunc fwd_txfm_ref; - vpx_bit_depth_t bit_depth_; - int mask_; -}; - -class Trans4x4DCT : public Trans4x4TestBase, - public ::testing::TestWithParam<Dct4x4Param> { - public: - virtual ~Trans4x4DCT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fdct4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4DCT, AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4DCT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4DCT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4DCT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4HT : public Trans4x4TestBase, - public ::testing::TestWithParam<Ht4x4Param> { - public: - virtual ~Trans4x4HT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride, tx_type_); - } - - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride, tx_type_); - } - - FhtFunc fwd_txfm_; - IhtFunc inv_txfm_; -}; - -TEST_P(Trans4x4HT, 
AccuracyCheck) { RunAccuracyCheck(1); } - -TEST_P(Trans4x4HT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4HT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); } - -class Trans4x4WHT : public Trans4x4TestBase, - public ::testing::TestWithParam<Dct4x4Param> { - public: - virtual ~Trans4x4WHT() {} - - virtual void SetUp() { - fwd_txfm_ = GET_PARAM(0); - inv_txfm_ = GET_PARAM(1); - tx_type_ = GET_PARAM(2); - pitch_ = 4; - fwd_txfm_ref = fwht4x4_ref; - bit_depth_ = GET_PARAM(3); - mask_ = (1 << bit_depth_) - 1; - } - virtual void TearDown() { libvpx_test::ClearSystemState(); } - - protected: - void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) { - fwd_txfm_(in, out, stride); - } - void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) { - inv_txfm_(out, dst, stride); - } - - FdctFunc fwd_txfm_; - IdctFunc inv_txfm_; -}; - -TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0); } - -TEST_P(Trans4x4WHT, CoeffCheck) { RunCoeffCheck(); } - -TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); } - -TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); } -using std::tr1::make_tuple; - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_c, - &vpx_idct4x4_16_add_c, 0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12), - make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P( - C, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - C, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10), - make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8))); -#else -INSTANTIATE_TEST_CASE_P(C, Trans4x4WHT, - ::testing::Values(make_tuple(&vp9_fwht4x4_c, - &vpx_iwht4x4_16_add_c, 0, - VPX_BITS_8))); -#endif // CONFIG_VP9_HIGHBITDEPTH - -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(NEON, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_neon, - &vpx_idct4x4_16_add_neon, - 0, VPX_BITS_8))); -#if 
!CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_CASE_P( - NEON, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8))); -#endif // !CONFIG_VP9_HIGHBITDEPTH -#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4WHT, - ::testing::Values( - make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8))); -#endif - -#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(SSE2, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_sse2, - &vpx_idct4x4_16_add_sse2, - 0, VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4DCT, - ::testing::Values( - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10), - make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12), - make_tuple(&vpx_fdct4x4_sse2, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8))); - -INSTANTIATE_TEST_CASE_P( - SSE2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8))); -#endif // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE - -#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -INSTANTIATE_TEST_CASE_P(MSA, Trans4x4DCT, - ::testing::Values(make_tuple(&vpx_fdct4x4_msa, - &vpx_idct4x4_16_add_msa, 0, - VPX_BITS_8))); -INSTANTIATE_TEST_CASE_P( - MSA, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8), - make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8))); -#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE -} // namespace diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc index dfbb5dc3d..5021dda9b 100644 --- a/libvpx/test/fdct8x8_test.cc +++ b/libvpx/test/fdct8x8_test.cc @@ -511,8 +511,8 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error - << " at index " << j; + EXPECT_EQ(0u, error) + << "Error: 8x8 IDCT has error " << error << " at index " << j; } } } @@ -739,7 +739,7 @@ INSTANTIATE_TEST_CASE_P( !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT, 
::testing::Values(make_tuple(&vpx_fdct8x8_ssse3, - &vpx_idct8x8_64_add_ssse3, + &vpx_idct8x8_64_add_sse2, 0, VPX_BITS_8))); #endif @@ -756,4 +756,11 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 2, VPX_BITS_8), make_tuple(&vp9_fht8x8_msa, &vp9_iht8x8_64_add_msa, 3, VPX_BITS_8))); #endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +INSTANTIATE_TEST_CASE_P(VSX, FwdTrans8x8DCT, + ::testing::Values(make_tuple(&vpx_fdct8x8_c, + &vpx_idct8x8_64_add_vsx, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libvpx/test/hadamard_test.cc b/libvpx/test/hadamard_test.cc index a55b15ad0..3b7cfeddc 100644 --- a/libvpx/test/hadamard_test.cc +++ b/libvpx/test/hadamard_test.cc @@ -22,7 +22,8 @@ namespace { using ::libvpx_test::ACMRandom; -typedef void (*HadamardFunc)(const int16_t *a, int a_stride, tran_low_t *b); +typedef void (*HadamardFunc)(const int16_t *a, ptrdiff_t a_stride, + tran_low_t *b); void hadamard_loop(const int16_t *a, int a_stride, int16_t *out) { int16_t b[8]; @@ -268,6 +269,11 @@ INSTANTIATE_TEST_CASE_P(SSE2, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_sse2)); #endif // HAVE_SSE2 +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P(AVX2, Hadamard16x16Test, + ::testing::Values(&vpx_hadamard_16x16_avx2)); +#endif // HAVE_AVX2 + #if HAVE_VSX INSTANTIATE_TEST_CASE_P(VSX, Hadamard16x16Test, ::testing::Values(&vpx_hadamard_16x16_vsx)); diff --git a/libvpx/test/idct_test.cc b/libvpx/test/idct_test.cc index 084b2ed0c..3700374d7 100644 --- a/libvpx/test/idct_test.cc +++ b/libvpx/test/idct_test.cc @@ -30,12 +30,15 @@ class IDCTTest : public ::testing::TestWithParam<IdctFunc> { virtual void SetUp() { UUT = GetParam(); - input = new (std::nothrow) Buffer<int16_t>(4, 4, 0); + input = new Buffer<int16_t>(4, 4, 0); ASSERT_TRUE(input != NULL); - predict = new (std::nothrow) Buffer<uint8_t>(4, 4, 3); + ASSERT_TRUE(input->Init()); + predict = new Buffer<uint8_t>(4, 4, 3); ASSERT_TRUE(predict != NULL); - output = new (std::nothrow) Buffer<uint8_t>(4, 4, 3); + ASSERT_TRUE(predict->Init()); + output = new Buffer<uint8_t>(4, 4, 3); ASSERT_TRUE(output != NULL); + ASSERT_TRUE(output->Init()); } virtual void TearDown() { @@ -166,4 +169,9 @@ INSTANTIATE_TEST_CASE_P(MMX, IDCTTest, INSTANTIATE_TEST_CASE_P(MSA, IDCTTest, ::testing::Values(vp8_short_idct4x4llm_msa)); #endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, IDCTTest, + ::testing::Values(vp8_short_idct4x4llm_mmi)); +#endif // HAVE_MMI } diff --git a/libvpx/test/invalid_file_test.cc b/libvpx/test/invalid_file_test.cc index eae81faa1..79220b0f6 100644 --- a/libvpx/test/invalid_file_test.cc +++ b/libvpx/test/invalid_file_test.cc @@ -45,8 +45,8 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, void OpenResFile(const std::string &res_file_name_) { res_file_ = libvpx_test::OpenTestDataFile(res_file_name_); - ASSERT_TRUE(res_file_ != NULL) << "Result file open failed. Filename: " - << res_file_name_; + ASSERT_TRUE(res_file_ != NULL) + << "Result file open failed. 
Filename: " << res_file_name_; } virtual bool HandleDecodeResult( @@ -120,11 +120,23 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, TEST_P(InvalidFileTest, ReturnCode) { RunTest(); } +#if CONFIG_VP8_DECODER +const DecodeParam kVP8InvalidFileTests[] = { + { 1, "invalid-bug-1443.ivf" }, +}; + +VP8_INSTANTIATE_TEST_CASE(InvalidFileTest, + ::testing::ValuesIn(kVP8InvalidFileTests)); +#endif // CONFIG_VP8_DECODER + #if CONFIG_VP9_DECODER const DecodeParam kVP9InvalidFileTests[] = { { 1, "invalid-vp90-02-v2.webm" }, #if CONFIG_VP9_HIGHBITDEPTH { 1, "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf" }, + { 1, + "invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-." + "ivf" }, #endif { 1, "invalid-vp90-03-v3.webm" }, { 1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf" }, @@ -164,12 +176,12 @@ class InvalidFileInvalidPeekTest : public InvalidFileTest { TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } #if CONFIG_VP8_DECODER -const DecodeParam kVP8InvalidFileTests[] = { +const DecodeParam kVP8InvalidPeekTests[] = { { 1, "invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf" }, }; VP8_INSTANTIATE_TEST_CASE(InvalidFileInvalidPeekTest, - ::testing::ValuesIn(kVP8InvalidFileTests)); + ::testing::ValuesIn(kVP8InvalidPeekTests)); #endif // CONFIG_VP8_DECODER #if CONFIG_VP9_DECODER diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h index b87624a11..5862d2649 100644 --- a/libvpx/test/ivf_video_source.h +++ b/libvpx/test/ivf_video_source.h @@ -47,8 +47,8 @@ class IVFVideoSource : public CompressedVideoSource { virtual void Begin() { input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. 
Filename: " << file_name_; // Read file header uint8_t file_hdr[kIvfFileHdrSize]; diff --git a/libvpx/test/keyframe_test.cc b/libvpx/test/keyframe_test.cc index 38bd923b7..ee75f401c 100644 --- a/libvpx/test/keyframe_test.cc +++ b/libvpx/test/keyframe_test.cc @@ -135,8 +135,8 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { for (std::vector<vpx_codec_pts_t>::const_iterator iter = kf_pts_list_.begin(); iter != kf_pts_list_.end(); ++iter) { if (deadline_ == VPX_DL_REALTIME && *iter > 0) - EXPECT_EQ(0, (*iter - 1) % 30) << "Unexpected keyframe at frame " - << *iter; + EXPECT_EQ(0, (*iter - 1) % 30) + << "Unexpected keyframe at frame " << *iter; else EXPECT_EQ(0, *iter % 30) << "Unexpected keyframe at frame " << *iter; } diff --git a/libvpx/test/level_test.cc b/libvpx/test/level_test.cc index 85097e94b..26935a81b 100644 --- a/libvpx/test/level_test.cc +++ b/libvpx/test/level_test.cc @@ -73,7 +73,7 @@ TEST_P(LevelTest, TestTargetLevel11Large) { target_level_ = 11; cfg_.rc_target_bitrate = 150; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); } TEST_P(LevelTest, TestTargetLevel20Large) { @@ -83,7 +83,7 @@ TEST_P(LevelTest, TestTargetLevel20Large) { target_level_ = 20; cfg_.rc_target_bitrate = 1200; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); } TEST_P(LevelTest, TestTargetLevel31Large) { @@ -93,7 +93,7 @@ TEST_P(LevelTest, TestTargetLevel31Large) { target_level_ = 31; cfg_.rc_target_bitrate = 8000; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(target_level_, level_); + ASSERT_GE(target_level_, level_); } // Test for keeping level stats only @@ -103,11 +103,11 @@ TEST_P(LevelTest, TestTargetLevel0) { target_level_ = 0; min_gf_internal_ = 4; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(11, level_); + ASSERT_GE(11, level_); cfg_.rc_target_bitrate = 1600; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(20, level_); + ASSERT_GE(20, level_); } // Test for level control being turned off @@ -130,7 +130,7 @@ TEST_P(LevelTest, TestTargetLevelApi) { if (level == 10 || level == 11 || level == 20 || level == 21 || level == 30 || level == 31 || level == 40 || level == 41 || level == 50 || level == 51 || level == 52 || level == 60 || - level == 61 || level == 62 || level == 0 || level == 255) + level == 61 || level == 62 || level == 0 || level == 1 || level == 255) EXPECT_EQ(VPX_CODEC_OK, vpx_codec_control(&enc, VP9E_SET_TARGET_LEVEL, level)); else diff --git a/libvpx/test/lpf_test.cc b/libvpx/test/lpf_test.cc index 4fca7d49c..e04b996cd 100644 --- a/libvpx/test/lpf_test.cc +++ b/libvpx/test/lpf_test.cc @@ -114,6 +114,18 @@ void InitInput(Pixel *s, Pixel *ref_s, ACMRandom *rnd, const uint8_t limit, } } +uint8_t GetOuterThresh(ACMRandom *rnd) { + return static_cast<uint8_t>(rnd->RandRange(3 * MAX_LOOP_FILTER + 5)); +} + +uint8_t GetInnerThresh(ACMRandom *rnd) { + return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1)); +} + +uint8_t GetHevThresh(ACMRandom *rnd) { + return static_cast<uint8_t>(rnd->RandRange(MAX_LOOP_FILTER + 1) >> 4); +} + class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { public: virtual ~Loop8Test6Param() {} @@ -162,15 +174,15 @@ TEST_P(Loop8Test6Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { 
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -221,15 +233,15 @@ TEST_P(Loop8Test6Param, ValueCheck) { for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -271,27 +283,27 @@ TEST_P(Loop8Test9Param, OperationCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; @@ -334,27 +346,27 @@ TEST_P(Loop8Test9Param, ValueCheck) { int first_failure = -1; for (int i = 0; i < count_test_block; ++i) { int err_count = 0; - uint8_t tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + uint8_t tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, blimit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh0[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(3 * MAX_LOOP_FILTER + 4)); + tmp = GetOuterThresh(&rnd); DECLARE_ALIGNED(16, const 
uint8_t, blimit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = static_cast<uint8_t>(rnd(MAX_LOOP_FILTER)); + tmp = GetInnerThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, limit1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; - tmp = rnd.Rand8(); + tmp = GetHevThresh(&rnd); DECLARE_ALIGNED(16, const uint8_t, thresh1[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; diff --git a/libvpx/test/minmax_test.cc b/libvpx/test/minmax_test.cc index e5c93ed7d..9c119116a 100644 --- a/libvpx/test/minmax_test.cc +++ b/libvpx/test/minmax_test.cc @@ -107,10 +107,10 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { int min_ref, max_ref, min, max; reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); - EXPECT_EQ(max_ref, max) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; - EXPECT_EQ(min_ref, min) << "when a_stride = " << a_stride - << " and b_stride = " << b_stride; + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; } } } diff --git a/libvpx/test/partial_idct_test.cc b/libvpx/test/partial_idct_test.cc index 740d7e202..f7b50f53a 100644 --- a/libvpx/test/partial_idct_test.cc +++ b/libvpx/test/partial_idct_test.cc @@ -62,9 +62,9 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { virtual ~PartialIDctTest() {} virtual void SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); - ftxfm_ = GET_PARAM(0); - full_itxfm_ = GET_PARAM(1); - partial_itxfm_ = GET_PARAM(2); + fwd_txfm_ = GET_PARAM(0); + full_inv_txfm_ = GET_PARAM(1); + partial_inv_txfm_ = GET_PARAM(2); tx_size_ = GET_PARAM(3); last_nonzero_ = GET_PARAM(4); bit_depth_ = GET_PARAM(5); @@ -128,12 +128,12 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { } void InitInput() { - const int max_coeff = (32766 << (bit_depth_ - 8)) / 4; - int max_energy_leftover = max_coeff * max_coeff; + const int64_t max_coeff = (32766 << (bit_depth_ - 8)) / 4; + int64_t max_energy_leftover = max_coeff * max_coeff; for (int j = 0; j < last_nonzero_; ++j) { tran_low_t coeff = static_cast<tran_low_t>( sqrt(1.0 * max_energy_leftover) * (rnd_.Rand16() - 32768) / 65536); - max_energy_leftover -= coeff * coeff; + max_energy_leftover -= static_cast<int64_t>(coeff) * coeff; if (max_energy_leftover < 0) { max_energy_leftover = 0; coeff = 0; @@ -161,6 +161,14 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { } } } + + printf("\ninput_block_:\n"); + for (int y = 0; y < size_; y++) { + for (int x = 0; x < size_; x++) { + printf("%6d,", input_block_[y * size_ + x]); + } + printf("\n"); + } } } @@ -177,9 +185,9 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { int output_block_size_; int bit_depth_; int mask_; - FwdTxfmFunc ftxfm_; - InvTxfmWithBdFunc full_itxfm_; - InvTxfmWithBdFunc partial_itxfm_; + FwdTxfmFunc fwd_txfm_; + InvTxfmWithBdFunc full_inv_txfm_; + InvTxfmWithBdFunc partial_inv_txfm_; ACMRandom rnd_; }; @@ -213,7 +221,7 @@ TEST_P(PartialIDctTest, RunQuantCheck) { } } - ftxfm_(input_extreme_block, output_ref_block, size_); + fwd_txfm_(input_extreme_block, output_ref_block, size_); // quantization with minimum allowed step sizes input_block_[0] = (output_ref_block[0] / 4) 
* 4; @@ -223,9 +231,9 @@ TEST_P(PartialIDctTest, RunQuantCheck) { } ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; @@ -238,9 +246,9 @@ TEST_P(PartialIDctTest, ResultsMatch) { InitInput(); ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: partial inverse transform produces different results"; @@ -255,9 +263,9 @@ TEST_P(PartialIDctTest, AddOutputBlock) { } ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: Transform results are not correctly added to output."; @@ -278,9 +286,9 @@ TEST_P(PartialIDctTest, SingleExtremeCoeff) { input_block_[vp9_default_scan_orders[tx_size_].scan[i]] = coeff; ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); ASM_REGISTER_STATE_CHECK( - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_)); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_)); ASSERT_EQ(0, memcmp(output_block_ref_, output_block_, pixel_size_ * output_block_size_)) << "Error: Fails with single coeff of " << coeff << " at " << i @@ -297,12 +305,12 @@ TEST_P(PartialIDctTest, DISABLED_Speed) { for (int i = 0; i < kCountSpeedTestBlock; ++i) { ASM_REGISTER_STATE_CHECK( - full_itxfm_(input_block_, output_block_ref_, stride_, bit_depth_)); + full_inv_txfm_(input_block_, output_block_ref_, stride_, bit_depth_)); } vpx_usec_timer timer; vpx_usec_timer_start(&timer); for (int i = 0; i < kCountSpeedTestBlock; ++i) { - partial_itxfm_(input_block_, output_block_, stride_, bit_depth_); + partial_inv_txfm_(input_block_, output_block_, stride_, bit_depth_); } libvpx_test::ClearSystemState(); vpx_usec_timer_mark(&timer); @@ -469,7 +477,9 @@ const PartialInvTxfmParam c_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(C, PartialIDctTest, ::testing::ValuesIn(c_partial_idct_tests)); -#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#if !CONFIG_EMULATE_HARDWARE + +#if HAVE_NEON const PartialInvTxfmParam neon_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_highbd_fdct32x32_c, @@ -617,12 +627,42 @@ const PartialInvTxfmParam neon_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(NEON, PartialIDctTest, ::testing::ValuesIn(neon_partial_idct_tests)); -#endif // HAVE_NEON && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_NEON -#if HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +#if 
HAVE_SSE2 // 32x32_135_ is implemented using the 1024 version. const PartialInvTxfmParam sse2_partial_idct_tests[] = { #if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse2>, TX_32X32, + 1024, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse2>, TX_32X32, 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse2>, TX_32X32, 34, 12, 2), make_tuple( &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_1_add_c>, &highbd_wrapper<vpx_highbd_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 2), @@ -642,6 +682,15 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse2>, TX_16X16, 256, 12, 2), make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse2>, TX_16X16, 38, 12, 2), + make_tuple( &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 2), make_tuple( @@ -701,12 +750,16 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, &wrapper<vpx_idct32x32_1024_add_sse2>, TX_32X32, 1024, 8, 1), + make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>, + &wrapper<vpx_idct32x32_135_add_sse2>, TX_32X32, 135, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>, &wrapper<vpx_idct32x32_34_add_sse2>, TX_32X32, 34, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>, &wrapper<vpx_idct32x32_1_add_sse2>, TX_32X32, 1, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_256_add_c>, &wrapper<vpx_idct16x16_256_add_sse2>, TX_16X16, 256, 8, 1), + make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_38_add_c>, + &wrapper<vpx_idct16x16_38_add_sse2>, TX_16X16, 38, 8, 1), 
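Each entry in these tables is one PartialInvTxfmParam tuple. Read against the renamed members in SetUp(), a row decodes as in the annotated sketch below; the label on the last field is inferred from the memcmp size computation, so treat the comments as a reading aid rather than the harness's own documentation.

    make_tuple(&vpx_fdct16x16_c,                    // forward transform used to generate coefficients
               &wrapper<vpx_idct16x16_38_add_c>,    // full_inv_txfm_: C reference implementation
               &wrapper<vpx_idct16x16_38_add_sse2>, // partial_inv_txfm_: optimized function under test
               TX_16X16,                            // tx_size_
               38,                                  // last_nonzero_: how many coefficients are kept
               8,                                   // bit_depth_
               1),                                  // pixel size in bytes (2 for high bit depth)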
make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_10_add_c>, &wrapper<vpx_idct16x16_10_add_sse2>, TX_16X16, 10, 8, 1), make_tuple(&vpx_fdct16x16_c, &wrapper<vpx_idct16x16_1_add_c>, @@ -726,27 +779,121 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(SSE2, PartialIDctTest, ::testing::ValuesIn(sse2_partial_idct_tests)); -#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSE2 -#if HAVE_SSSE3 && !CONFIG_EMULATE_HARDWARE +#if HAVE_SSSE3 const PartialInvTxfmParam ssse3_partial_idct_tests[] = { - make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, - &wrapper<vpx_idct32x32_1024_add_ssse3>, TX_32X32, 1024, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_135_add_c>, &wrapper<vpx_idct32x32_135_add_ssse3>, TX_32X32, 135, 8, 1), make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>, &wrapper<vpx_idct32x32_34_add_ssse3>, TX_32X32, 34, 8, 1), - make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_64_add_c>, - &wrapper<vpx_idct8x8_64_add_ssse3>, TX_8X8, 64, 8, 1), make_tuple(&vpx_fdct8x8_c, &wrapper<vpx_idct8x8_12_add_c>, &wrapper<vpx_idct8x8_12_add_ssse3>, TX_8X8, 12, 8, 1) }; INSTANTIATE_TEST_CASE_P(SSSE3, PartialIDctTest, ::testing::ValuesIn(ssse3_partial_idct_tests)); -#endif // HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_EMULATE_HARDWARE +#endif // HAVE_SSSE3 -#if HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH +const PartialInvTxfmParam sse4_1_partial_idct_tests[] = { + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32, + 1024, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32, + 1024, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_1024_add_sse4_1>, TX_32X32, + 1024, 12, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32, + 135, 8, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32, + 135, 10, 2), + make_tuple(&vpx_highbd_fdct32x32_c, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_135_add_sse4_1>, TX_32X32, + 135, 12, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 8, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 10, 2), + make_tuple( + &vpx_highbd_fdct32x32_c, &highbd_wrapper<vpx_highbd_idct32x32_34_add_c>, + &highbd_wrapper<vpx_highbd_idct32x32_34_add_sse4_1>, TX_32X32, 34, 12, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16, + 256, 8, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16, + 256, 10, 2), + make_tuple(&vpx_highbd_fdct16x16_c, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_256_add_sse4_1>, TX_16X16, + 256, 12, 2), + 
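Both the 8-bit and the high-bit-depth rows go through the wrapper<> and highbd_wrapper<> adapter templates so that every implementation can be called through the single InvTxfmWithBdFunc signature the harness stores. A plausible shape for those adapters is sketched below; the exact destination cast used on the high-bit-depth path is an assumption here, not a quote from the file.

    // Hypothetical adapter shapes: bind a concrete idct at compile time and
    // normalize its signature to (input, dest, stride, bit_depth).
    template <InvTxfmFunc fn>
    void wrapper(const tran_low_t *in, uint8_t *dest, int stride, int bd) {
      (void)bd;              // 8-bit functions take no bit-depth argument
      fn(in, dest, stride);
    }

    template <InvTxfmHighbdFunc fn>
    void highbd_wrapper(const tran_low_t *in, uint8_t *dest, int stride, int bd) {
      fn(in, CONVERT_TO_SHORTPTR(dest), stride, bd);  // high-bit-depth pixels are uint16_t
    }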
make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_38_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_38_add_sse4_1>, TX_16X16, 38, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper<vpx_highbd_idct16x16_10_add_c>, + &highbd_wrapper<vpx_highbd_idct16x16_10_add_sse4_1>, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_64_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_64_add_sse4_1>, TX_8X8, 64, 12, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 8, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 10, 2), + make_tuple( + &vpx_highbd_fdct8x8_c, &highbd_wrapper<vpx_highbd_idct8x8_12_add_c>, + &highbd_wrapper<vpx_highbd_idct8x8_12_add_sse4_1>, TX_8X8, 12, 12, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>, + &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 8, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>, + &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 10, 2), + make_tuple( + &vpx_highbd_fdct4x4_c, &highbd_wrapper<vpx_highbd_idct4x4_16_add_c>, + &highbd_wrapper<vpx_highbd_idct4x4_16_add_sse4_1>, TX_4X4, 16, 12, 2) +}; + +INSTANTIATE_TEST_CASE_P(SSE4_1, PartialIDctTest, + ::testing::ValuesIn(sse4_1_partial_idct_tests)); +#endif // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH + +#if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH const PartialInvTxfmParam dspr2_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, &wrapper<vpx_idct32x32_1024_add_dspr2>, TX_32X32, 1024, 8, 1), @@ -774,9 +921,9 @@ const PartialInvTxfmParam dspr2_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(DSPR2, PartialIDctTest, ::testing::ValuesIn(dspr2_partial_idct_tests)); -#endif // HAVE_DSPR2 && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH -#if HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH // 32x32_135_ is implemented using the 1024 version. 
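The guard cleanup running through this file trades the repeated "X && !CONFIG_EMULATE_HARDWARE" conjunctions for a single outer gate, so each per-ISA block now carries only its own condition. Schematically, the hunks above and below leave the file nested like this:

    #if !CONFIG_EMULATE_HARDWARE  // one outer gate for all SIMD instantiations
    #if HAVE_NEON
    /* NEON cases */
    #endif  // HAVE_NEON
    #if HAVE_SSE2
    /* SSE2 cases */
    #endif  // HAVE_SSE2
    #if HAVE_SSSE3
    /* SSSE3 cases */
    #endif  // HAVE_SSSE3
    #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
    /* SSE4.1 cases */
    #endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
    #if HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
    /* DSPR2 cases */
    #endif  // HAVE_DSPR2 && !CONFIG_VP9_HIGHBITDEPTH
    #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
    /* MSA cases */
    #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
    #endif  // !CONFIG_EMULATE_HARDWARE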
const PartialInvTxfmParam msa_partial_idct_tests[] = { make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>, @@ -805,6 +952,8 @@ const PartialInvTxfmParam msa_partial_idct_tests[] = { INSTANTIATE_TEST_CASE_P(MSA, PartialIDctTest, ::testing::ValuesIn(msa_partial_idct_tests)); -#endif // HAVE_MSA && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH + +#endif // !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc index 95da09c31..5a2ade1ef 100644 --- a/libvpx/test/pp_filter_test.cc +++ b/libvpx/test/pp_filter_test.cc @@ -57,12 +57,14 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { // 5-tap filter needs 2 padding rows above and below the block in the input. Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2); + ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. // Though the left padding is only 8 bytes, the assembly code tries to // read 16 bytes before the pointer. Buffer<uint8_t> dst_image = Buffer<uint8_t>(block_width, block_height, 8, 16, 8, 8); + ASSERT_TRUE(dst_image.Init()); uint8_t *const flimits = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_width)); @@ -88,8 +90,8 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckFilterOutput) { uint8_t *pixel_ptr = dst_image.TopLeftPixel(); for (int i = 0; i < block_height; ++i) { for (int j = 0; j < block_width; ++j) { - ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) << "at (" << i << ", " << j - << ")"; + ASSERT_EQ(kExpectedOutput[i], pixel_ptr[j]) + << "at (" << i << ", " << j << ")"; } pixel_ptr += dst_image.stride(); } @@ -108,6 +110,7 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. Buffer<uint8_t> src_image = Buffer<uint8_t>(block_width, block_height, 2, 2, 10, 2); + ASSERT_TRUE(src_image.Init()); // Filter extends output block by 8 samples at left and right edges. // Though the left padding is only 8 bytes, there is 'above' padding as well @@ -116,7 +119,9 @@ TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) { // SSE2 reads in blocks of 16. Pad an extra 8 in case the width is not %16. Buffer<uint8_t> dst_image = Buffer<uint8_t>(block_width, block_height, 8, 8, 16, 8); + ASSERT_TRUE(dst_image.Init()); Buffer<uint8_t> dst_image_ref = Buffer<uint8_t>(block_width, block_height, 8); + ASSERT_TRUE(dst_image_ref.Init()); // Filter values are set in blocks of 16 for Y and 8 for U/V. Each macroblock // can have a different filter. 
SSE2 assembly reads flimits in blocks of 16 so @@ -177,8 +182,8 @@ class VpxMbPostProcAcrossIpTest int rows, int cols, int src_pitch) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { - ASSERT_EQ(expected_output[c], src_c[c]) << "at (" << r << ", " << c - << ")"; + ASSERT_EQ(expected_output[c], src_c[c]) + << "at (" << r << ", " << c << ")"; } src_c += src_pitch; } @@ -197,10 +202,12 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckLowFilterOutput) { const int cols = 16; Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); Buffer<uint8_t> expected_output = Buffer<uint8_t>(cols, rows, 0); + ASSERT_TRUE(expected_output.Init()); SetCols(expected_output.TopLeftPixel(), rows, cols, expected_output.stride()); RunFilterLevel(src.TopLeftPixel(), rows, cols, src.stride(), q2mbl(0), @@ -212,6 +219,7 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckMediumFilterOutput) { const int cols = 16; Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); @@ -228,6 +236,7 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckHighFilterOutput) { const int cols = 16; Buffer<uint8_t> src = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(src.Init()); src.SetPadding(10); SetCols(src.TopLeftPixel(), rows, cols, src.stride()); @@ -249,7 +258,9 @@ TEST_P(VpxMbPostProcAcrossIpTest, CheckCvsAssembly) { const int cols = 16; Buffer<uint8_t> c_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(c_mem.Init()); Buffer<uint8_t> asm_mem = Buffer<uint8_t>(cols, rows, 8, 8, 17, 8); + ASSERT_TRUE(asm_mem.Init()); // When level >= 100, the filter behaves the same as the level = INT_MAX // When level < 20, it behaves the same as the level = 0 @@ -285,8 +296,8 @@ class VpxMbPostProcDownTest int rows, int cols, int src_pitch) { for (int r = 0; r < rows; r++) { for (int c = 0; c < cols; c++) { - ASSERT_EQ(expected_output[r * rows + c], src_c[c]) << "at (" << r - << ", " << c << ")"; + ASSERT_EQ(expected_output[r * rows + c], src_c[c]) + << "at (" << r << ", " << c << ")"; } src_c += src_pitch; } @@ -305,6 +316,7 @@ TEST_P(VpxMbPostProcDownTest, CheckHighFilterOutput) { const int cols = 16; Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); @@ -340,6 +352,7 @@ TEST_P(VpxMbPostProcDownTest, CheckMediumFilterOutput) { const int cols = 16; Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); @@ -370,6 +383,7 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) { const int cols = 16; Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); src_c.SetPadding(10); SetRows(src_c.TopLeftPixel(), rows, cols, src_c.stride()); @@ -392,7 +406,9 @@ TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) { rnd.Reset(ACMRandom::DeterministicSeed()); Buffer<uint8_t> src_c = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_c.Init()); Buffer<uint8_t> src_asm = Buffer<uint8_t>(cols, rows, 8, 8, 8, 17); + ASSERT_TRUE(src_asm.Init()); for (int level = 0; level < 100; level++) { src_c.SetPadding(10); diff --git a/libvpx/test/predict_test.cc b/libvpx/test/predict_test.cc index a6e2b3cf3..9f366ae52 100644 --- a/libvpx/test/predict_test.cc +++ 
b/libvpx/test/predict_test.cc @@ -324,6 +324,15 @@ INSTANTIATE_TEST_CASE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_msa))); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_mmi), + make_tuple(8, 8, &vp8_sixtap_predict8x8_mmi), + make_tuple(8, 4, &vp8_sixtap_predict8x4_mmi), + make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { diff --git a/libvpx/test/quantize_test.cc b/libvpx/test/quantize_test.cc index 69da8994c..40bb2642e 100644 --- a/libvpx/test/quantize_test.cc +++ b/libvpx/test/quantize_test.cc @@ -200,4 +200,12 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c), make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c))); #endif // HAVE_MSA + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P( + MMI, QuantizeTest, + ::testing::Values( + make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c), + make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c))); +#endif // HAVE_MMI } // namespace diff --git a/libvpx/test/register_state_check.h b/libvpx/test/register_state_check.h index 84641c8e9..a779e5c06 100644 --- a/libvpx/test/register_state_check.h +++ b/libvpx/test/register_state_check.h @@ -113,8 +113,8 @@ class RegisterStateCheck { int64_t post_store[8]; vpx_push_neon(post_store); for (int i = 0; i < 8; ++i) { - EXPECT_EQ(pre_store_[i], post_store[i]) << "d" << i + 8 - << " has been modified"; + EXPECT_EQ(pre_store_[i], post_store[i]) + << "d" << i + 8 << " has been modified"; } } diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc index c9950dd43..e95dc6651 100644 --- a/libvpx/test/resize_test.cc +++ b/libvpx/test/resize_test.cc @@ -298,10 +298,10 @@ TEST_P(ResizeTest, TestExternalResizeWorks) { unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 0); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; } } @@ -513,10 +513,10 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) { unsigned int expected_h; ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, &expected_w, &expected_h, 1); - EXPECT_EQ(expected_w, info->w) << "Frame " << frame - << " had unexpected width"; - EXPECT_EQ(expected_h, info->h) << "Frame " << frame - << " had unexpected height"; + EXPECT_EQ(expected_w, info->w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info->h) + << "Frame " << frame << " had unexpected height"; EXPECT_EQ(static_cast<unsigned int>(0), GetMismatchFrames()); } } diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc index fe3983eb7..67c3c5315 100644 --- a/libvpx/test/sad_test.cc +++ b/libvpx/test/sad_test.cc @@ -644,19 +644,50 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); #if HAVE_NEON const SadMxNParam neon_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_neon), + SadMxNParam(64, 32, &vpx_sad64x32_neon), SadMxNParam(32, 32, &vpx_sad32x32_neon), + SadMxNParam(16, 32, &vpx_sad16x32_neon), SadMxNParam(16, 16, &vpx_sad16x16_neon), SadMxNParam(16, 8, &vpx_sad16x8_neon), SadMxNParam(8, 16, &vpx_sad8x16_neon), SadMxNParam(8, 8, 
&vpx_sad8x8_neon), + SadMxNParam(8, 4, &vpx_sad8x4_neon), + SadMxNParam(4, 8, &vpx_sad4x8_neon), SadMxNParam(4, 4, &vpx_sad4x4_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +const SadMxNAvgParam avg_neon_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_neon), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_neon), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon), +}; +INSTANTIATE_TEST_CASE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); + const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon), SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon), SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_neon), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_neon), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon), }; INSTANTIATE_TEST_CASE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); #endif // HAVE_NEON @@ -865,6 +896,14 @@ const SadMxNx4Param x4d_avx2_tests[] = { INSTANTIATE_TEST_CASE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); #endif // HAVE_AVX2 +#if HAVE_AVX512 +const SadMxNx4Param x4d_avx512_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx512), +}; +INSTANTIATE_TEST_CASE_P(AVX512, SADx4Test, + ::testing::ValuesIn(x4d_avx512_tests)); +#endif // HAVE_AVX512 + //------------------------------------------------------------------------------ // MIPS functions #if HAVE_MSA @@ -934,5 +973,84 @@ const SadMxNParam vsx_tests[] = { SadMxNParam(16, 8, &vpx_sad16x8_vsx), }; INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); + +const SadMxNAvgParam avg_vsx_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_vsx), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_vsx), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_vsx), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_vsx), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_vsx), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_vsx), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_vsx), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_vsx), +}; +INSTANTIATE_TEST_CASE_P(VSX, SADavgTest, ::testing::ValuesIn(avg_vsx_tests)); + +const SadMxNx4Param x4d_vsx_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_vsx), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_vsx), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_vsx), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_vsx), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_vsx), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_vsx), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_vsx), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_vsx), +}; +INSTANTIATE_TEST_CASE_P(VSX, SADx4Test, ::testing::ValuesIn(x4d_vsx_tests)); #endif // HAVE_VSX + +//------------------------------------------------------------------------------ +// Loongson functions +#if HAVE_MMI +const SadMxNParam 
mmi_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_mmi), + SadMxNParam(64, 32, &vpx_sad64x32_mmi), + SadMxNParam(32, 64, &vpx_sad32x64_mmi), + SadMxNParam(32, 32, &vpx_sad32x32_mmi), + SadMxNParam(32, 16, &vpx_sad32x16_mmi), + SadMxNParam(16, 32, &vpx_sad16x32_mmi), + SadMxNParam(16, 16, &vpx_sad16x16_mmi), + SadMxNParam(16, 8, &vpx_sad16x8_mmi), + SadMxNParam(8, 16, &vpx_sad8x16_mmi), + SadMxNParam(8, 8, &vpx_sad8x8_mmi), + SadMxNParam(8, 4, &vpx_sad8x4_mmi), + SadMxNParam(4, 8, &vpx_sad4x8_mmi), + SadMxNParam(4, 4, &vpx_sad4x4_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADTest, ::testing::ValuesIn(mmi_tests)); + +const SadMxNAvgParam avg_mmi_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_mmi), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_mmi), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_mmi), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_mmi), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_mmi), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_mmi), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_mmi), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_mmi), + SadMxNAvgParam(8, 16, &vpx_sad8x16_avg_mmi), + SadMxNAvgParam(8, 8, &vpx_sad8x8_avg_mmi), + SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_mmi), + SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_mmi), + SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADavgTest, ::testing::ValuesIn(avg_mmi_tests)); + +const SadMxNx4Param x4d_mmi_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_mmi), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_mmi), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_mmi), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_mmi), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_mmi), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_mmi), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_mmi), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_mmi), + SadMxNx4Param(8, 16, &vpx_sad8x16x4d_mmi), + SadMxNx4Param(8, 8, &vpx_sad8x8x4d_mmi), + SadMxNx4Param(8, 4, &vpx_sad8x4x4d_mmi), + SadMxNx4Param(4, 8, &vpx_sad4x8x4d_mmi), + SadMxNx4Param(4, 4, &vpx_sad4x4x4d_mmi), +}; +INSTANTIATE_TEST_CASE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests)); +#endif // HAVE_MMI } // namespace diff --git a/libvpx/test/set_roi.cc b/libvpx/test/set_roi.cc index 38711a806..f63954752 100644 --- a/libvpx/test/set_roi.cc +++ b/libvpx/test/set_roi.cc @@ -146,14 +146,6 @@ TEST(VP8RoiMapTest, ParameterCheck) { if (deltas_valid != roi_retval) break; } - // Test that we report and error if cyclic refresh is enabled. - cpi.cyclic_refresh_mode_enabled = 1; - roi_retval = - vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows, cpi.common.mb_cols, - delta_q, delta_lf, threshold); - EXPECT_EQ(-1, roi_retval) << "cyclic refresh check error"; - cpi.cyclic_refresh_mode_enabled = 0; - // Test invalid number of rows or colums. roi_retval = vp8_set_roimap(&cpi, roi_map, cpi.common.mb_rows + 1, diff --git a/libvpx/test/temporal_filter_test.cc b/libvpx/test/temporal_filter_test.cc index 8615ba45a..655a36be9 100644 --- a/libvpx/test/temporal_filter_test.cc +++ b/libvpx/test/temporal_filter_test.cc @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <limits> + #include "third_party/googletest/src/include/gtest/gtest.h" #include "./vp9_rtcd.h" @@ -35,6 +37,7 @@ void reference_filter(const Buffer<uint8_t> &a, const Buffer<uint8_t> &b, int w, Buffer<unsigned int> *accumulator, Buffer<uint16_t> *count) { Buffer<int> diff_sq = Buffer<int>(w, h, 0); + ASSERT_TRUE(diff_sq.Init()); diff_sq.Set(0); int rounding = 0; @@ -119,6 +122,7 @@ TEST_P(TemporalFilterTest, SizeCombinations) { // Depending on subsampling this function may be called with values of 8 or 16 // for width and height, in any combination. Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8); + ASSERT_TRUE(a.Init()); const int filter_weight = 2; const int filter_strength = 6; @@ -127,13 +131,20 @@ TEST_P(TemporalFilterTest, SizeCombinations) { for (int height = 8; height <= 16; height += 8) { // The second buffer must not have any border. Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); + ASSERT_TRUE(b.Init()); Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_ref.Init()); Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_chk.Init()); Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_ref.Init()); Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_chk.Init()); - a.Set(&rnd_, &ACMRandom::Rand8); - b.Set(&rnd_, &ACMRandom::Rand8); + // The difference between the buffers must be small to pass the threshold + // to apply the filter. + a.Set(&rnd_, 0, 7); + b.Set(&rnd_, 0, 7); accum_ref.Set(rnd_.Rand8()); accum_chk.CopyFrom(accum_ref); @@ -161,18 +172,32 @@ TEST_P(TemporalFilterTest, CompareReferenceRandom) { for (int width = 8; width <= 16; width += 8) { for (int height = 8; height <= 16; height += 8) { Buffer<uint8_t> a = Buffer<uint8_t>(width, height, 8); + ASSERT_TRUE(a.Init()); // The second buffer must not have any border. Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); + ASSERT_TRUE(b.Init()); Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_ref.Init()); Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_chk.Init()); Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_ref.Init()); Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_chk.Init()); for (int filter_strength = 0; filter_strength <= 6; ++filter_strength) { for (int filter_weight = 0; filter_weight <= 2; ++filter_weight) { - for (int repeat = 0; repeat < 10; ++repeat) { - a.Set(&rnd_, &ACMRandom::Rand8); - b.Set(&rnd_, &ACMRandom::Rand8); + for (int repeat = 0; repeat < 100; ++repeat) { + if (repeat < 50) { + a.Set(&rnd_, 0, 7); + b.Set(&rnd_, 0, 7); + } else { + // Check large (but close) values as well. + a.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7, + std::numeric_limits<uint8_t>::max()); + b.Set(&rnd_, std::numeric_limits<uint8_t>::max() - 7, + std::numeric_limits<uint8_t>::max()); + } accum_ref.Set(rnd_.Rand8()); accum_chk.CopyFrom(accum_ref); @@ -202,6 +227,7 @@ TEST_P(TemporalFilterTest, CompareReferenceRandom) { TEST_P(TemporalFilterTest, DISABLED_Speed) { Buffer<uint8_t> a = Buffer<uint8_t>(16, 16, 8); + ASSERT_TRUE(a.Init()); const int filter_weight = 2; const int filter_strength = 6; @@ -210,13 +236,18 @@ TEST_P(TemporalFilterTest, DISABLED_Speed) { for (int height = 8; height <= 16; height += 8) { // The second buffer must not have any border. 
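The tightened value ranges in these temporal_filter_test.cc hunks follow from the filter's per-pixel weighting: the weight decays with the squared difference between the two buffers and reaches zero once the scaled square exceeds the cap, so full-range random inputs would leave most pixels unfiltered and the optimized path untested. A minimal sketch of that weighting, with the rounding term omitted (the scale of 3, the strength shift, and the cap of 16 mirror the reference_filter logic above):

    // Per-pixel filter weight: decays with (a - b)^2 and is zero once the
    // scaled squared difference exceeds 16.
    int TemporalFilterWeight(int a, int b, int strength, int filter_weight) {
      int modifier = (a - b) * (a - b) * 3;  // squared difference, scaled by 3
      modifier >>= strength;                 // strength is in [0, 6]
      if (modifier > 16) return 0;           // too different: pixel rejected
      return (16 - modifier) * filter_weight;
    }

With both buffers drawn from [0, 7], |a - b| <= 7, so even at the maximum strength of 6 the modifier is at most (7 * 7 * 3) >> 6 = 2 and every pixel keeps a nonzero weight; the second half of the repeats draws values within 7 of 255 to exercise the same path at the top of the 8-bit range.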
Buffer<uint8_t> b = Buffer<uint8_t>(width, height, 0); + ASSERT_TRUE(b.Init()); Buffer<unsigned int> accum_ref = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_ref.Init()); Buffer<unsigned int> accum_chk = Buffer<unsigned int>(width, height, 0); + ASSERT_TRUE(accum_chk.Init()); Buffer<uint16_t> count_ref = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_ref.Init()); Buffer<uint16_t> count_chk = Buffer<uint16_t>(width, height, 0); + ASSERT_TRUE(count_chk.Init()); - a.Set(&rnd_, &ACMRandom::Rand8); - b.Set(&rnd_, &ACMRandom::Rand8); + a.Set(&rnd_, 0, 7); + b.Set(&rnd_, 0, 7); accum_chk.Set(0); count_chk.Set(0); diff --git a/libvpx/test/test-data.mk b/libvpx/test/test-data.mk index b39ab8763..f405e4ef1 100644 --- a/libvpx/test/test-data.mk +++ b/libvpx/test/test-data.mk @@ -732,6 +732,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp93-2-20-12bit-yuv444.webm.md5 endif # CONFIG_VP9_HIGHBITDEPTH # Invalid files for testing libvpx error checking. +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-bug-1443.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += invalid-vp80-00-comprehensive-018.ivf.2kf_0x6.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-01-v3.webm @@ -772,6 +774,8 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s367 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-07-frame_parallel-1.webm @@ -874,3 +878,5 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-21-resize_inter_1920x1080_7_3-4.webm.md5 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_3.ivf.md5 +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm +LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1 index 22ca6f564..99b4e1e46 100644 --- a/libvpx/test/test-data.sha1 +++ b/libvpx/test/test-data.sha1 @@ -6,6 +6,8 @@ b87815bf86020c592ccc7a846ba2e28ec8043902 *hantro_odd.yuv 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-.ivf.res c123d1f9f02fb4143abb5e271916e3a3080de8f6 *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf 456d1493e52d32a5c30edf44a27debc1fa6b253a *invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf.res +efafb92b7567bc04c3f1432ea6c268c1c31affd5 *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf +5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-2-21-resize_inter_320x180_5_3-4.webm.ivf.s45551_r01-05_b6-.ivf.res 
fe346136b9b8c1e6f6084cc106485706915795e4 *invalid-vp90-01-v3.webm 5d9474c0309b7ca09a182d888f73b37a8fe1362c *invalid-vp90-01-v3.webm.res d78e2fceba5ac942246503ec8366f879c4775ca5 *invalid-vp90-02-v2.webm @@ -848,3 +850,7 @@ a000d568431d07379dd5a8ec066061c07e560b47 *invalid-vp90-2-00-quantizer-63.ivf.kf_ 6fa3d3ac306a3d9ce1d610b78441dc00d2c2d4b9 *tos_vp8.webm e402cbbf9e550ae017a1e9f1f73931c1d18474e8 *invalid-crbug-667044.webm d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-crbug-667044.webm.res +fd9df7f3f6992af1d7a9dde975c9a0d6f28c053d *invalid-bug-1443.ivf +fd3020fa6e9ca5966206738654c97dec313b0a95 *invalid-bug-1443.ivf.res +17696cd21e875f1d6e5d418cbf89feab02c8850a *vp90-2-22-svc_1280x720_1.webm +e2f9e1e47a791b4e939a9bdc50bf7a25b3761f77 *vp90-2-22-svc_1280x720_1.webm.md5 diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk index c51e645c1..a3716be60 100644 --- a/libvpx/test/test.mk +++ b/libvpx/test/test.mk @@ -39,7 +39,6 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc @@ -124,6 +123,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += predict_test.cc LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.h ifeq ($(CONFIG_VP8_ENCODER)$(CONFIG_TEMPORAL_DENOISING),yesyes) LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp8_denoiser_sse2_test.cc @@ -154,11 +154,15 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += avg_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += comp_avg_pred_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct16x16_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct32x32_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_partial_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += dct_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += hadamard_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += minmax_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_scale_test.cc +ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += temporal_filter_test.cc +endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc diff --git a/libvpx/test/test_intra_pred_speed.cc b/libvpx/test/test_intra_pred_speed.cc index 23fce335a..1cdeda410 100644 --- a/libvpx/test/test_intra_pred_speed.cc +++ b/libvpx/test/test_intra_pred_speed.cc @@ -480,29 +480,70 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c) #if HAVE_SSE2 -HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4, - vpx_highbd_dc_predictor_4x4_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_4x4_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_4x4_c) +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, + vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2, + vpx_highbd_dc_128_predictor_4x4_sse2, 
vpx_highbd_v_predictor_4x4_sse2, + vpx_highbd_h_predictor_4x4_sse2, NULL, vpx_highbd_d135_predictor_4x4_sse2, + vpx_highbd_d117_predictor_4x4_sse2, vpx_highbd_d153_predictor_4x4_sse2, + vpx_highbd_d207_predictor_4x4_sse2, vpx_highbd_d63_predictor_4x4_sse2, + vpx_highbd_tm_predictor_4x4_c) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, - vpx_highbd_dc_predictor_8x8_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_8x8_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) + vpx_highbd_dc_predictor_8x8_sse2, + vpx_highbd_dc_left_predictor_8x8_sse2, + vpx_highbd_dc_top_predictor_8x8_sse2, + vpx_highbd_dc_128_predictor_8x8_sse2, + vpx_highbd_v_predictor_8x8_sse2, + vpx_highbd_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_tm_predictor_8x8_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_16x16_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, - vpx_highbd_tm_predictor_16x16_sse2) + vpx_highbd_dc_predictor_16x16_sse2, + vpx_highbd_dc_left_predictor_16x16_sse2, + vpx_highbd_dc_top_predictor_16x16_sse2, + vpx_highbd_dc_128_predictor_16x16_sse2, + vpx_highbd_v_predictor_16x16_sse2, + vpx_highbd_h_predictor_16x16_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_16x16_sse2) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_sse2, NULL, NULL, NULL, - vpx_highbd_v_predictor_32x32_sse2, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, - vpx_highbd_tm_predictor_32x32_sse2) + vpx_highbd_dc_predictor_32x32_sse2, + vpx_highbd_dc_left_predictor_32x32_sse2, + vpx_highbd_dc_top_predictor_32x32_sse2, + vpx_highbd_dc_128_predictor_32x32_sse2, + vpx_highbd_v_predictor_32x32_sse2, + vpx_highbd_h_predictor_32x32_sse2, NULL, NULL, NULL, + NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2) #endif // HAVE_SSE2 +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_4x4_ssse3, NULL, + NULL, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_8x8_ssse3, + vpx_highbd_d135_predictor_8x8_ssse3, + vpx_highbd_d117_predictor_8x8_ssse3, + vpx_highbd_d153_predictor_8x8_ssse3, + vpx_highbd_d207_predictor_8x8_ssse3, + vpx_highbd_d63_predictor_8x8_ssse3, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_16x16_ssse3, + vpx_highbd_d135_predictor_16x16_ssse3, + vpx_highbd_d117_predictor_16x16_ssse3, + vpx_highbd_d153_predictor_16x16_ssse3, + vpx_highbd_d207_predictor_16x16_ssse3, + vpx_highbd_d63_predictor_16x16_ssse3, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL, + NULL, NULL, vpx_highbd_d45_predictor_32x32_ssse3, + vpx_highbd_d135_predictor_32x32_ssse3, + vpx_highbd_d117_predictor_32x32_ssse3, + vpx_highbd_d153_predictor_32x32_ssse3, + vpx_highbd_d207_predictor_32x32_ssse3, + vpx_highbd_d63_predictor_32x32_ssse3, NULL) +#endif // HAVE_SSSE3 + #if HAVE_NEON HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, diff --git a/libvpx/test/test_libvpx.cc b/libvpx/test/test_libvpx.cc index 8a70b4e28..30641ae8c 100644 --- a/libvpx/test/test_libvpx.cc +++ b/libvpx/test/test_libvpx.cc @@ -53,6 +53,9 @@ int main(int argc, char **argv) { } if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); if (!(simd_caps & HAS_AVX2)) 
append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } #endif // ARCH_X86 || ARCH_X86_64 #if !CONFIG_SHARED diff --git a/libvpx/test/test_vector_test.cc b/libvpx/test/test_vector_test.cc index 14c509d5c..1879b3d27 100644 --- a/libvpx/test/test_vector_test.cc +++ b/libvpx/test/test_vector_test.cc @@ -28,13 +28,10 @@ namespace { -enum DecodeMode { kSerialMode, kFrameParallelMode }; +const int kThreads = 0; +const int kFileName = 1; -const int kDecodeMode = 0; -const int kThreads = 1; -const int kFileName = 2; - -typedef std::tr1::tuple<int, int, const char *> DecodeParam; +typedef std::tr1::tuple<int, const char *> DecodeParam; class TestVectorTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam<DecodeParam> { @@ -53,8 +50,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, void OpenMD5File(const std::string &md5_file_name_) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name_); - ASSERT_TRUE(md5_file_ != NULL) << "Md5 file open failed. Filename: " - << md5_file_name_; + ASSERT_TRUE(md5_file_ != NULL) + << "Md5 file open failed. Filename: " << md5_file_name_; } virtual void DecompressedFrameHook(const vpx_image_t &img, @@ -92,29 +89,14 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, TEST_P(TestVectorTest, MD5Match) { const DecodeParam input = GET_PARAM(1); const std::string filename = std::tr1::get<kFileName>(input); - const int threads = std::tr1::get<kThreads>(input); - const int mode = std::tr1::get<kDecodeMode>(input); vpx_codec_flags_t flags = 0; vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t(); char str[256]; - if (mode == kFrameParallelMode) { - flags |= VPX_CODEC_USE_FRAME_THREADING; -#if CONFIG_VP9_DECODER - // TODO(hkuang): Fix frame parallel decode bug. See issue 1086. - if (resize_clips_.find(filename) != resize_clips_.end()) { - printf("Skipping the test file: %s, due to frame parallel decode bug.\n", - filename.c_str()); - return; - } -#endif - } - - cfg.threads = threads; + cfg.threads = std::tr1::get<kThreads>(input); - snprintf(str, sizeof(str) / sizeof(str[0]) - 1, - "file: %s mode: %s threads: %d", filename.c_str(), - mode == 0 ? "Serial" : "Parallel", threads); + snprintf(str, sizeof(str) / sizeof(str[0]) - 1, "file: %s threads: %d", + filename.c_str(), cfg.threads); SCOPED_TRACE(str); // Open compressed video file. @@ -145,13 +127,10 @@ TEST_P(TestVectorTest, MD5Match) { ASSERT_NO_FATAL_FAILURE(RunLoop(video.get(), cfg)); } -// Test VP8 decode in serial mode with single thread. -// NOTE: VP8 only support serial mode. #if CONFIG_VP8_DECODER VP8_INSTANTIATE_TEST_CASE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. ::testing::Values(1), // Single thread. ::testing::ValuesIn(libvpx_test::kVP8TestVectors, libvpx_test::kVP8TestVectors + @@ -164,33 +143,28 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP8)), ::testing::Combine( - ::testing::Values(0), // Serial Mode. - ::testing::Range(1, 8), // With 1 ~ 8 threads. + ::testing::Range(2, 9), // With 2 ~ 8 threads. ::testing::ValuesIn(libvpx_test::kVP8TestVectors, libvpx_test::kVP8TestVectors + libvpx_test::kNumVP8TestVectors)))); #endif // CONFIG_VP8_DECODER -// Test VP9 decode in serial mode with single thread. #if CONFIG_VP9_DECODER VP9_INSTANTIATE_TEST_CASE( TestVectorTest, ::testing::Combine( - ::testing::Values(0), // Serial Mode. ::testing::Values(1), // Single thread. 
::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + libvpx_test::kNumVP9TestVectors))); -// Test VP9 decode in frame parallel mode with different number of threads. INSTANTIATE_TEST_CASE_P( - DISABLED_VP9MultiThreadedFrameParallel, TestVectorTest, + VP9MultiThreaded, TestVectorTest, ::testing::Combine( ::testing::Values( static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)), ::testing::Combine( - ::testing::Values(1), // Frame Parallel mode. ::testing::Range(2, 9), // With 2 ~ 8 threads. ::testing::ValuesIn(libvpx_test::kVP9TestVectors, libvpx_test::kVP9TestVectors + diff --git a/libvpx/test/test_vectors.cc b/libvpx/test/test_vectors.cc index def78da28..3ffc3efc4 100644 --- a/libvpx/test/test_vectors.cc +++ b/libvpx/test/test_vectors.cc @@ -371,6 +371,7 @@ const char *const kVP9TestVectors[] = { #endif // CONFIG_VP9_HIGHBITDEPTH "vp90-2-20-big_superframe-01.webm", "vp90-2-20-big_superframe-02.webm", + "vp90-2-22-svc_1280x720_1.webm", RESIZE_TEST_VECTORS }; const char *const kVP9TestVectorsSvc[] = { "vp90-2-22-svc_1280x720_3.ivf" }; diff --git a/libvpx/test/twopass_encoder.sh b/libvpx/test/twopass_encoder.sh index 7a223f2af..eaeaabdfd 100755 --- a/libvpx/test/twopass_encoder.sh +++ b/libvpx/test/twopass_encoder.sh @@ -54,7 +54,10 @@ twopass_encoder_vp9() { fi } -twopass_encoder_tests="twopass_encoder_vp8 - twopass_encoder_vp9" -run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + twopass_encoder_tests="twopass_encoder_vp8 + twopass_encoder_vp9" + + run_tests twopass_encoder_verify_environment "${twopass_encoder_tests}" +fi diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc index d607a097d..421024ad8 100644 --- a/libvpx/test/variance_test.cc +++ b/libvpx/test/variance_test.cc @@ -492,7 +492,7 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() { vpx_usec_timer timer; vpx_usec_timer_start(&timer); - for (int i = 0; i < 100000000 / block_size(); ++i) { + for (int i = 0; i < (1 << 30) / block_size(); ++i) { const uint32_t variance = params_.func(src_, width(), ref_, width(), &sse); // Ignore return value. 
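The new bound in the variance SpeedTest keeps the total number of pixels processed at about 2^30 whatever block is under test, roughly ten times the old 100000000 budget, and the power-of-two constant lets the division reduce to a shift. Worked out for a few sizes (own arithmetic, for orientation only):

    //   4x4:   block_size =   16 -> (1 << 30) / 16   = 67,108,864 iterations
    //   16x16: block_size =  256 -> (1 << 30) / 256  =  4,194,304 iterations
    //   64x64: block_size = 4096 -> (1 << 30) / 4096 =    262,144 iterations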
(void)variance; @@ -561,46 +561,26 @@ void MainTestClass<FunctionType>::MaxTestSse() { //////////////////////////////////////////////////////////////////////////////// -using ::std::tr1::get; -using ::std::tr1::make_tuple; -using ::std::tr1::tuple; - -template <typename SubpelVarianceFunctionType> +template <typename FunctionType> class SubpelVarianceTest - : public ::testing::TestWithParam< - tuple<int, int, SubpelVarianceFunctionType, int> > { + : public ::testing::TestWithParam<TestParams<FunctionType> > { public: virtual void SetUp() { - const tuple<int, int, SubpelVarianceFunctionType, int> ¶ms = - this->GetParam(); - log2width_ = get<0>(params); - width_ = 1 << log2width_; - log2height_ = get<1>(params); - height_ = 1 << log2height_; - subpel_variance_ = get<2>(params); - if (get<3>(params)) { - bit_depth_ = (vpx_bit_depth_t)get<3>(params); - use_high_bit_depth_ = true; - } else { - bit_depth_ = VPX_BITS_8; - use_high_bit_depth_ = false; - } - mask_ = (1 << bit_depth_) - 1; + params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); - block_size_ = width_ * height_; - if (!use_high_bit_depth_) { - src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); - sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_)); - ref_ = new uint8_t[block_size_ + width_ + height_ + 1]; + if (!use_high_bit_depth()) { + src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size())); + sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size())); + ref_ = new uint8_t[block_size() + width() + height() + 1]; #if CONFIG_VP9_HIGHBITDEPTH } else { src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); sec_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>( - vpx_memalign(16, block_size_ * sizeof(uint16_t)))); - ref_ = - CONVERT_TO_BYTEPTR(new uint16_t[block_size_ + width_ + height_ + 1]); + vpx_memalign(16, block_size() * sizeof(uint16_t)))); + ref_ = CONVERT_TO_BYTEPTR( + new uint16_t[block_size() + width() + height() + 1]); #endif // CONFIG_VP9_HIGHBITDEPTH } ASSERT_TRUE(src_ != NULL); @@ -609,7 +589,7 @@ class SubpelVarianceTest } virtual void TearDown() { - if (!use_high_bit_depth_) { + if (!use_high_bit_depth()) { vpx_free(src_); delete[] ref_; vpx_free(sec_); @@ -631,42 +611,45 @@ class SubpelVarianceTest uint8_t *src_; uint8_t *ref_; uint8_t *sec_; - bool use_high_bit_depth_; - vpx_bit_depth_t bit_depth_; - int width_, log2width_; - int height_, log2height_; - int block_size_, mask_; - SubpelVarianceFunctionType subpel_variance_; + TestParams<FunctionType> params_; + + // some relay helpers + bool use_high_bit_depth() const { return params_.use_high_bit_depth; } + int byte_shift() const { return params_.bit_depth - 8; } + int block_size() const { return params_.block_size; } + int width() const { return params_.width; } + int height() const { return params_.height; } + uint32_t mask() const { return params_.mask; } }; template <typename SubpelVarianceFunctionType> void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - 
for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -680,28 +663,28 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() { // Ref: Set the first half of values to the maximum, the second half to 0. for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - const int half = block_size_ / 2; - if (!use_high_bit_depth_) { + const int half = block_size() / 2; + if (!use_high_bit_depth()) { memset(src_, 0, half); memset(src_ + half, 255, half); memset(ref_, 255, half); - memset(ref_ + half, 0, half + width_ + height_ + 1); + memset(ref_ + half, 0, half + width() + height() + 1); #if CONFIG_VP9_HIGHBITDEPTH } else { - vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half); + vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask(), half); vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half); vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half); - vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_, - half + width_ + height_ + 1); + vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask(), + half + width() + height() + 1); #endif // CONFIG_VP9_HIGHBITDEPTH } unsigned int sse1, sse2; unsigned int var1; ASM_REGISTER_STATE_CHECK( - var1 = subpel_variance_(ref_, width_ + 1, x, y, src_, width_, &sse1)); - const unsigned int var2 = - subpel_variance_ref(ref_, src_, log2width_, log2height_, x, y, &sse2, - use_high_bit_depth_, bit_depth_); + var1 = params_.func(ref_, width() + 1, x, y, src_, width(), &sse1)); + const unsigned int var2 = subpel_variance_ref( + ref_, src_, params_.log2width, params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "for xoffset " << x << " and yoffset " << y; EXPECT_EQ(var1, var2) << "for xoffset " << x << " and yoffset " << y; } @@ -712,33 +695,32 @@ template <> void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { - if (!use_high_bit_depth_) { - for (int j = 0; j < block_size_; j++) { + if (!use_high_bit_depth()) { + for (int j = 0; j < block_size(); j++) { src_[j] = rnd_.Rand8(); sec_[j] = rnd_.Rand8(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { + for (int j = 0; j < block_size() + width() + height() + 1; j++) { ref_[j] = rnd_.Rand8(); } #if CONFIG_VP9_HIGHBITDEPTH } else { - for (int j = 0; j < block_size_; j++) { - CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_; - CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size(); j++) { + 
CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask(); + CONVERT_TO_SHORTPTR(sec_)[j] = rnd_.Rand16() & mask(); } - for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) { - CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_; + for (int j = 0; j < block_size() + width() + height() + 1; j++) { + CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask(); } #endif // CONFIG_VP9_HIGHBITDEPTH } uint32_t sse1, sse2; uint32_t var1, var2; - ASM_REGISTER_STATE_CHECK(var1 = - subpel_variance_(ref_, width_ + 1, x, y, - src_, width_, &sse1, sec_)); - var2 = subpel_avg_variance_ref(ref_, src_, sec_, log2width_, log2height_, - x, y, &sse2, use_high_bit_depth_, - static_cast<vpx_bit_depth_t>(bit_depth_)); + ASM_REGISTER_STATE_CHECK(var1 = params_.func(ref_, width() + 1, x, y, + src_, width(), &sse1, sec_)); + var2 = subpel_avg_variance_ref(ref_, src_, sec_, params_.log2width, + params_.log2height, x, y, &sse2, + use_high_bit_depth(), params_.bit_depth); EXPECT_EQ(sse1, sse2) << "at position " << x << ", " << y; EXPECT_EQ(var1, var2) << "at position " << x << ", " << y; } @@ -798,37 +780,41 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); +typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); - + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_c, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_c, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_c, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_c, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_c, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_c, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_c, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_c, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_c, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_c, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_c, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); + +typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), - make_tuple(3, 4, 
&vpx_sub_pixel_avg_variance8x16_c, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest; @@ -850,18 +836,18 @@ TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); } /* TODO(debargha): This test does not support the highbd version INSTANTIATE_TEST_CASE_P( C, VpxHBDMseTest, - ::testing::Values(make_tuple(4, 4, &vpx_highbd_12_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_12_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_10_mse8x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse16x8_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x16_c), - make_tuple(4, 4, &vpx_highbd_8_mse8x8_c))); + ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c), + MseParams(4, 4, &vpx_highbd_12_mse16x8_c), + MseParams(4, 4, &vpx_highbd_12_mse8x16_c), + MseParams(4, 4, &vpx_highbd_12_mse8x8_c), + MseParams(4, 4, &vpx_highbd_10_mse16x16_c), + MseParams(4, 4, &vpx_highbd_10_mse16x8_c), + MseParams(4, 4, &vpx_highbd_10_mse8x16_c), + MseParams(4, 4, &vpx_highbd_10_mse8x8_c), + MseParams(4, 4, &vpx_highbd_8_mse16x16_c), + MseParams(4, 4, &vpx_highbd_8_mse16x8_c), + MseParams(4, 4, &vpx_highbd_8_mse8x16_c), + MseParams(4, 4, &vpx_highbd_8_mse8x8_c))); */ INSTANTIATE_TEST_CASE_P( @@ -909,88 +895,161 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( C, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), - make_tuple(2, 2, 
&vpx_highbd_8_sub_pixel_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12))); + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8), + SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10), + 
SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10), + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, + 12))); INSTANTIATE_TEST_CASE_P( C, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8), - make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8), - make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), - make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), - make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12), - 
make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), - make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), - make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8), + SubpelAvgVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, + 8), + SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, + 8), + SubpelAvgVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, + 8), + SubpelAvgVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, + 8), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_c, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_c, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_c, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_c, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_c, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_c, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_c, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_c, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_c, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_c, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_c, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_c, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_c, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_c, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_c, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_c, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_c, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_c, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12), + 
SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_c, + 12))); #endif // CONFIG_VP9_HIGHBITDEPTH #if HAVE_SSE2 @@ -1021,36 +1080,37 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_sse2, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_sse2, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_sse2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_sse2, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_sse2, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_sse2, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_sse2, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_sse2, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_sse2, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_sse2, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_sse2, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_sse2, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_sse2, 0))); INSTANTIATE_TEST_CASE_P( SSE2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_sse2, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_sse2, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_sse2, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_sse2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_sse2, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_sse2, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_sse2, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_sse2, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_sse2, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_sse2, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_sse2, 0), + SubpelAvgVarianceParams(3, 2, 
&vpx_sub_pixel_avg_variance8x4_sse2, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_sse2, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0))); #if CONFIG_VP9_HIGHBITDEPTH /* TODO(debargha): This test does not support the highbd version @@ -1107,112 +1167,219 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2, VpxHBDSubpelVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, 8), - make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, 8))); + SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_sse2, + 12), + SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_sse2, + 12), + SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_sse2, + 12), + SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_sse2, + 12), + SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_sse2, + 12), + SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_sse2, + 12), + SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_sse2, + 12), + SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_sse2, + 12), + SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_sse2, + 12), + SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_sse2, + 12), + SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_sse2, + 12), + 
SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_sse2, + 10), + SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_sse2, + 10), + SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_sse2, + 10), + SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_sse2, + 10), + SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_sse2, + 10), + SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_sse2, + 10), + SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_sse2, + 10), + SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_sse2, + 10), + SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_sse2, + 10), + SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_sse2, + 10), + SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_sse2, + 10), + SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_sse2, + 8), + SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_sse2, + 8), + SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_sse2, + 8), + SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_sse2, + 8), + SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_sse2, + 8), + SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_sse2, + 8), + SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_sse2, + 8), + SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_sse2, + 8), + SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_sse2, + 8), + SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_sse2, 8), + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_sse2, + 8))); INSTANTIATE_TEST_CASE_P( SSE2, VpxHBDSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, 12), - make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, 12), - make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, 12), - make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, 12), - make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, 12), - make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, 12), - make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, 12), - make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, 12), - make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, 12), - make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, 12), - make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, 12), - make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, 10), - make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, 10), - make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, 10), - make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, 10), - make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, 10), - make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, 10), - make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, 10), - make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, 10), - make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, 10), - make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, 10), - make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, 10), - make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, 8), - make_tuple(6, 5, 
&vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, 8), - make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, 8), - make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, 8), - make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, 8), - make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, 8), - make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, 8), - make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, 8), - make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, 8), - make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, 8), - make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, 8))); + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_12_sub_pixel_avg_variance64x64_sse2, + 12), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_12_sub_pixel_avg_variance64x32_sse2, + 12), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_12_sub_pixel_avg_variance32x64_sse2, + 12), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_12_sub_pixel_avg_variance32x32_sse2, + 12), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_12_sub_pixel_avg_variance32x16_sse2, + 12), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_12_sub_pixel_avg_variance16x32_sse2, + 12), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_12_sub_pixel_avg_variance16x16_sse2, + 12), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_12_sub_pixel_avg_variance16x8_sse2, + 12), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_12_sub_pixel_avg_variance8x16_sse2, + 12), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_12_sub_pixel_avg_variance8x8_sse2, + 12), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_12_sub_pixel_avg_variance8x4_sse2, + 12), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_10_sub_pixel_avg_variance64x64_sse2, + 10), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_10_sub_pixel_avg_variance64x32_sse2, + 10), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_10_sub_pixel_avg_variance32x64_sse2, + 10), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_10_sub_pixel_avg_variance32x32_sse2, + 10), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_10_sub_pixel_avg_variance32x16_sse2, + 10), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_10_sub_pixel_avg_variance16x32_sse2, + 10), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_10_sub_pixel_avg_variance16x16_sse2, + 10), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_10_sub_pixel_avg_variance16x8_sse2, + 10), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_10_sub_pixel_avg_variance8x16_sse2, + 10), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_10_sub_pixel_avg_variance8x8_sse2, + 10), + SubpelAvgVarianceParams(3, 2, + &vpx_highbd_10_sub_pixel_avg_variance8x4_sse2, + 10), + SubpelAvgVarianceParams(6, 6, + &vpx_highbd_8_sub_pixel_avg_variance64x64_sse2, + 8), + SubpelAvgVarianceParams(6, 5, + &vpx_highbd_8_sub_pixel_avg_variance64x32_sse2, + 8), + SubpelAvgVarianceParams(5, 6, + &vpx_highbd_8_sub_pixel_avg_variance32x64_sse2, + 8), + SubpelAvgVarianceParams(5, 5, + &vpx_highbd_8_sub_pixel_avg_variance32x32_sse2, + 8), + SubpelAvgVarianceParams(5, 4, + &vpx_highbd_8_sub_pixel_avg_variance32x16_sse2, + 8), + SubpelAvgVarianceParams(4, 5, + &vpx_highbd_8_sub_pixel_avg_variance16x32_sse2, + 8), + SubpelAvgVarianceParams(4, 4, + &vpx_highbd_8_sub_pixel_avg_variance16x16_sse2, + 8), + SubpelAvgVarianceParams(4, 3, + &vpx_highbd_8_sub_pixel_avg_variance16x8_sse2, + 8), + SubpelAvgVarianceParams(3, 4, + &vpx_highbd_8_sub_pixel_avg_variance8x16_sse2, + 8), + SubpelAvgVarianceParams(3, 3, + &vpx_highbd_8_sub_pixel_avg_variance8x8_sse2, + 8), + 
SubpelAvgVarianceParams(3, 2, + &vpx_highbd_8_sub_pixel_avg_variance8x4_sse2, + 8))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_SSE2 #if HAVE_SSSE3 INSTANTIATE_TEST_CASE_P( SSSE3, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_ssse3, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_ssse3, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_ssse3, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_ssse3, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_ssse3, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_ssse3, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_ssse3, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_ssse3, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_ssse3, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_ssse3, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_ssse3, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_ssse3, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_ssse3, 0))); INSTANTIATE_TEST_CASE_P( SSSE3, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_ssse3, + 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_ssse3, + 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_ssse3, + 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_ssse3, + 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_ssse3, + 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_ssse3, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_ssse3, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_ssse3, 0), 
+ SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_ssse3, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_ssse3, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_ssse3, + 0))); #endif // HAVE_SSSE3 #if HAVE_AVX2 @@ -1229,14 +1396,16 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_avx2, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_avx2, 0))); INSTANTIATE_TEST_CASE_P( AVX2, VpxSubpelAvgVarianceTest, ::testing::Values( - make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0))); + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_avx2, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, + 0))); #endif // HAVE_AVX2 #if HAVE_NEON @@ -1265,17 +1434,37 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_neon, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_neon, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_neon, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_neon, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_neon, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_neon, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_neon, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_neon, 0))); + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_neon, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_neon, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_neon, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_neon, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_neon, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_neon, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_neon, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_neon, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_neon, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_neon, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_neon, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_neon, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_neon, 0))); + +INSTANTIATE_TEST_CASE_P( + NEON, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_neon, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_neon, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_neon, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_neon, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_neon, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_neon, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_neon, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_neon, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_neon, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_neon, 0), + 
SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0))); #endif // HAVE_NEON #if HAVE_MSA @@ -1310,35 +1499,37 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( MSA, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), - make_tuple(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); + ::testing::Values( + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_msa, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_msa, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_msa, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_msa, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_msa, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_msa, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_msa, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_msa, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_msa, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_msa, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_msa, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_msa, 0), + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_msa, 0))); INSTANTIATE_TEST_CASE_P( MSA, VpxSubpelAvgVarianceTest, - ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), - make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), - make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), - make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), - make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), - make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), - make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), - make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), - make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), - make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), - make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), - make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), - make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_msa, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_msa, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_msa, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_msa, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_msa, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_msa, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_msa, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_msa, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_msa, 0), + 
SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_msa, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_msa, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_msa, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_msa, 0))); #endif // HAVE_MSA #if HAVE_VSX @@ -1349,4 +1540,62 @@ INSTANTIATE_TEST_CASE_P(VSX, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_vsx))); #endif // HAVE_VSX + +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_mmi), + MseParams(4, 3, &vpx_mse16x8_mmi), + MseParams(3, 4, &vpx_mse8x16_mmi), + MseParams(3, 3, &vpx_mse8x8_mmi))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_mmi), + VarianceParams(6, 5, &vpx_variance64x32_mmi), + VarianceParams(5, 6, &vpx_variance32x64_mmi), + VarianceParams(5, 5, &vpx_variance32x32_mmi), + VarianceParams(5, 4, &vpx_variance32x16_mmi), + VarianceParams(4, 5, &vpx_variance16x32_mmi), + VarianceParams(4, 4, &vpx_variance16x16_mmi), + VarianceParams(4, 3, &vpx_variance16x8_mmi), + VarianceParams(3, 4, &vpx_variance8x16_mmi), + VarianceParams(3, 3, &vpx_variance8x8_mmi), + VarianceParams(3, 2, &vpx_variance8x4_mmi), + VarianceParams(2, 3, &vpx_variance4x8_mmi), + VarianceParams(2, 2, &vpx_variance4x4_mmi))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxSubpelVarianceTest, + ::testing::Values( + SubpelVarianceParams(6, 6, &vpx_sub_pixel_variance64x64_mmi, 0), + SubpelVarianceParams(6, 5, &vpx_sub_pixel_variance64x32_mmi, 0), + SubpelVarianceParams(5, 6, &vpx_sub_pixel_variance32x64_mmi, 0), + SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_mmi, 0), + SubpelVarianceParams(5, 4, &vpx_sub_pixel_variance32x16_mmi, 0), + SubpelVarianceParams(4, 5, &vpx_sub_pixel_variance16x32_mmi, 0), + SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_mmi, 0), + SubpelVarianceParams(4, 3, &vpx_sub_pixel_variance16x8_mmi, 0), + SubpelVarianceParams(3, 4, &vpx_sub_pixel_variance8x16_mmi, 0), + SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_mmi, 0), + SubpelVarianceParams(3, 2, &vpx_sub_pixel_variance8x4_mmi, 0), + SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_mmi, 0), + SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_mmi, 0))); + +INSTANTIATE_TEST_CASE_P( + MMI, VpxSubpelAvgVarianceTest, + ::testing::Values( + SubpelAvgVarianceParams(6, 6, &vpx_sub_pixel_avg_variance64x64_mmi, 0), + SubpelAvgVarianceParams(6, 5, &vpx_sub_pixel_avg_variance64x32_mmi, 0), + SubpelAvgVarianceParams(5, 6, &vpx_sub_pixel_avg_variance32x64_mmi, 0), + SubpelAvgVarianceParams(5, 5, &vpx_sub_pixel_avg_variance32x32_mmi, 0), + SubpelAvgVarianceParams(5, 4, &vpx_sub_pixel_avg_variance32x16_mmi, 0), + SubpelAvgVarianceParams(4, 5, &vpx_sub_pixel_avg_variance16x32_mmi, 0), + SubpelAvgVarianceParams(4, 4, &vpx_sub_pixel_avg_variance16x16_mmi, 0), + SubpelAvgVarianceParams(4, 3, &vpx_sub_pixel_avg_variance16x8_mmi, 0), + SubpelAvgVarianceParams(3, 4, &vpx_sub_pixel_avg_variance8x16_mmi, 0), + SubpelAvgVarianceParams(3, 3, &vpx_sub_pixel_avg_variance8x8_mmi, 0), + SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_mmi, 0), + SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0), + SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0))); +#endif // HAVE_MMI } // namespace diff --git a/libvpx/test/vp8_fdct4x4_test.cc b/libvpx/test/vp8_fdct4x4_test.cc index 9f69ae164..b7697d859 100644 --- a/libvpx/test/vp8_fdct4x4_test.cc +++ 
b/libvpx/test/vp8_fdct4x4_test.cc
@@ -199,4 +199,8 @@ INSTANTIATE_TEST_CASE_P(SSE2, FdctTest,
 INSTANTIATE_TEST_CASE_P(MSA, FdctTest,
                         ::testing::Values(vp8_short_fdct4x4_msa));
 #endif  // HAVE_MSA
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, FdctTest,
+                        ::testing::Values(vp8_short_fdct4x4_mmi));
+#endif  // HAVE_MMI
 }  // namespace
diff --git a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
index 53dc8c9fe..62e8dcb9b 100644
--- a/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
+++ b/libvpx/test/vp9_encoder_parms_get_to_decoder.cc
@@ -99,9 +99,7 @@ class VpxEncoderParmsGetToDecoder
     vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder();
     vpx_codec_alg_priv_t *const priv =
         reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv);
-    FrameWorkerData *const worker_data =
-        reinterpret_cast<FrameWorkerData *>(priv->frame_workers[0].data1);
-    VP9_COMMON *const common = &worker_data->pbi->common;
+    VP9_COMMON *const common = &priv->pbi->common;

     if (encode_parms.lossless) {
       EXPECT_EQ(0, common->base_qindex);
diff --git a/libvpx/test/vp9_ethread_test.cc b/libvpx/test/vp9_ethread_test.cc
index 4e8d814c1..6b7e51211 100644
--- a/libvpx/test/vp9_ethread_test.cc
+++ b/libvpx/test/vp9_ethread_test.cc
@@ -50,7 +50,6 @@ class VPxFirstPassEncoderThreadTest
     InitializeConfig();
     SetMode(encoding_mode_);

-    cfg_.g_lag_in_frames = 3;
     cfg_.rc_end_usage = VPX_VBR;
     cfg_.rc_2pass_vbr_minsection_pct = 5;
     cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -128,8 +127,10 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats, double factor) {
     const double *frame_stats2 = reinterpret_cast<double *>(stats2);

     for (j = 0; j < kDbl; ++j) {
-      EXPECT_LE(fabs(*frame_stats1 - *frame_stats2),
-                fabs(*frame_stats1) / factor);
+      ASSERT_LE(fabs(*frame_stats1 - *frame_stats2),
+                fabs(*frame_stats1) / factor)
+          << "First failure @ frame #" << i << " stat #" << j << " ("
+          << *frame_stats1 << " vs. " << *frame_stats2 << ")";
       frame_stats1++;
       frame_stats2++;
     }
@@ -183,7 +184,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

   // Compare to check if using or not using row-mt generates close stats.
-  compare_fp_stats(&firstpass_stats_, 1000.0);
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));

   // Test single thread vs multiple threads
   row_mt_mode_ = 1;
@@ -197,7 +198,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));

   // Compare to check if single-thread and multi-thread stats are close enough.
-  compare_fp_stats(&firstpass_stats_, 1000.0);
+  ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0));

   // Bit exact test in row_mt mode.
   // When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact
@@ -238,7 +239,6 @@ class VPxEncoderThreadTest
     SetMode(encoding_mode_);

     if (encoding_mode_ != ::libvpx_test::kRealTime) {
-      cfg_.g_lag_in_frames = 3;
       cfg_.rc_end_usage = VPX_VBR;
       cfg_.rc_2pass_vbr_minsection_pct = 5;
       cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -340,8 +340,6 @@ TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
   ASSERT_EQ(single_thr_md5, multi_thr_md5);

   // Part 2: row_mt_mode_ = 0 vs row_mt_mode_ = 1 single thread bit exact test.
-  // The first-pass stats are not bit exact here, but that difference doesn't
-  // cause a mismatch between the final bitstreams.
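// [Editor's sketch, not part of the patch: why the compare_fp_stats() calls
// above are wrapped in ASSERT_NO_FATAL_FAILURE.] In GoogleTest a fatal
// failure (ASSERT_*) inside a helper function only aborts that helper; the
// calling test keeps running unless it checks for the failure itself.
// The names below are hypothetical.
static void HelperThatMayFail(int value) {
  ASSERT_GT(value, 0) << "value must be positive";  // aborts the helper only
}

TEST(ExampleTest, StopsWhenHelperFails) {
  // Without the wrapper, execution would continue past the failing helper.
  ASSERT_NO_FATAL_FAILURE(HelperThatMayFail(-1));
  // Not reached when the helper raises a fatal failure.
}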
   row_mt_mode_ = 1;

   // Encode using single thread
diff --git a/libvpx/test/vp9_frame_parallel_test.cc b/libvpx/test/vp9_frame_parallel_test.cc
deleted file mode 100644
index 136557720..000000000
--- a/libvpx/test/vp9_frame_parallel_test.cc
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <cstdio>
-#include <cstdlib>
-#include <string>
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "./vpx_config.h"
-#include "test/codec_factory.h"
-#include "test/decode_test_driver.h"
-#include "test/ivf_video_source.h"
-#include "test/md5_helper.h"
-#include "test/util.h"
-#if CONFIG_WEBM_IO
-#include "test/webm_video_source.h"
-#endif
-#include "vpx_mem/vpx_mem.h"
-
-namespace {
-
-using std::string;
-
-#if CONFIG_WEBM_IO
-
-struct PauseFileList {
-  const char *name;
-  // md5 sum for decoded frames which does not include skipped frames.
-  const char *expected_md5;
-  const int pause_frame_num;
-};
-
-// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
-// seek to next key frame and then continue decoding until the end. Return
-// the md5 of the decoded frames which does not include skipped frames.
-string DecodeFileWithPause(const string &filename, int num_threads,
-                           int pause_num) {
-  libvpx_test::WebMVideoSource video(filename);
-  video.Init();
-  int in_frames = 0;
-  int out_frames = 0;
-
-  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-  cfg.threads = num_threads;
-  vpx_codec_flags_t flags = 0;
-  flags |= VPX_CODEC_USE_FRAME_THREADING;
-  libvpx_test::VP9Decoder decoder(cfg, flags);
-
-  libvpx_test::MD5 md5;
-  video.Begin();
-
-  do {
-    ++in_frames;
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-
-    // Pause at specified frame number.
-    if (in_frames == pause_num) {
-      // Flush the decoder and then seek to next key frame.
-      decoder.DecodeFrame(NULL, 0);
-      video.SeekToNextKeyFrame();
-    } else {
-      video.Next();
-    }
-
-    // Flush the decoder at the end of the video.
-    if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      ++out_frames;
-      md5.Add(img);
-    }
-  } while (video.cxdata() != NULL);
-
-  EXPECT_EQ(in_frames, out_frames)
-      << "Input frame count does not match output frame count";
-
-  return string(md5.Get());
-}
-
-void DecodeFilesWithPause(const PauseFileList files[]) {
-  for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5,
-                DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
-          << "threads = " << t;
-    }
-  }
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, PauseSeekResume) {
-  // vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
-  // one key frame for every ten frames.
-  static const PauseFileList files[] = {
-    { "vp90-2-07-frame_parallel-1.webm", "6ea7c3875d67252e7caf2bc6e75b36b1",
-      6 },
-    { "vp90-2-07-frame_parallel-1.webm", "4bb634160c7356a8d7d4299b6dc83a45",
-      12 },
-    { "vp90-2-07-frame_parallel-1.webm", "89772591e6ef461f9fa754f916c78ed8",
-      26 },
-    { NULL, NULL, 0 },
-  };
-  DecodeFilesWithPause(files);
-}
-
-struct FileList {
-  const char *name;
-  // md5 sum for decoded frames which does not include corrupted frames.
-  const char *expected_md5;
-  // Expected number of decoded frames which does not include corrupted frames.
-  const int expected_frame_count;
-};
-
-// Decodes |filename| with |num_threads|. Return the md5 of the decoded
-// frames which does not include corrupted frames.
-string DecodeFile(const string &filename, int num_threads,
-                  int expected_frame_count) {
-  libvpx_test::WebMVideoSource video(filename);
-  video.Init();
-
-  vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
-  cfg.threads = num_threads;
-  const vpx_codec_flags_t flags = VPX_CODEC_USE_FRAME_THREADING;
-  libvpx_test::VP9Decoder decoder(cfg, flags);
-
-  libvpx_test::MD5 md5;
-  video.Begin();
-
-  int out_frames = 0;
-  do {
-    const vpx_codec_err_t res =
-        decoder.DecodeFrame(video.cxdata(), video.frame_size());
-    // TODO(hkuang): frame parallel mode should return an error on corruption.
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res) << decoder.DecodeError();
-      break;
-    }
-
-    video.Next();
-
-    // Flush the decoder at the end of the video.
-    if (!video.cxdata()) decoder.DecodeFrame(NULL, 0);
-
-    libvpx_test::DxDataIterator dec_iter = decoder.GetDxData();
-    const vpx_image_t *img;
-
-    // Get decompressed data
-    while ((img = dec_iter.Next())) {
-      ++out_frames;
-      md5.Add(img);
-    }
-  } while (video.cxdata() != NULL);
-
-  EXPECT_EQ(expected_frame_count, out_frames)
-      << "Input frame count does not match expected output frame count";
-
-  return string(md5.Get());
-}
-
-void DecodeFiles(const FileList files[]) {
-  for (const FileList *iter = files; iter->name != NULL; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 2; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5,
-                DecodeFile(iter->name, t, iter->expected_frame_count))
-          << "threads = " << t;
-    }
-  }
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, InvalidFileTest) {
-  static const FileList files[] = {
-    // invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 11th frame has corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-1.webm",
-      "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
-    // invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 1st and 31st frames have
-    // corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-2.webm",
-      "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
-    // invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
-    // one key frame for every ten frames. The 5th and 13th frames have
-    // corrupted data.
-    { "invalid-vp90-2-07-frame_parallel-3.webm",
-      "8256544308de926b0681e04685b98677", 27 },
-    { NULL, NULL, 0 },
-  };
-  DecodeFiles(files);
-}
-
-TEST(DISABLED_VP9MultiThreadedFrameParallel, ValidFileTest) {
-  static const FileList files[] = {
-#if CONFIG_VP9_HIGHBITDEPTH
-    { "vp92-2-20-10bit-yuv420.webm", "a16b99df180c584e8db2ffeda987d293", 10 },
-#endif
-    { NULL, NULL, 0 },
-  };
-  DecodeFiles(files);
-}
-#endif  // CONFIG_WEBM_IO
-}  // namespace
diff --git a/libvpx/test/vp9_intrapred_test.cc b/libvpx/test/vp9_intrapred_test.cc
index bee0213ea..39c5e79eb 100644
--- a/libvpx/test/vp9_intrapred_test.cc
+++ b/libvpx/test/vp9_intrapred_test.cc
@@ -467,10 +467,164 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) {
   RunTest(left_col, above_data, dst, ref_dst);
 }

+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_TO_C_8, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3,
+                             &vpx_highbd_d63_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c,
+                             &vpx_highbd_d63_predictor_32x32_ssse3, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3,
+                             &vpx_highbd_d117_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3,
+                             &vpx_highbd_d117_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c,
+                             &vpx_highbd_d117_predictor_32x32_ssse3, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3,
+                             &vpx_highbd_d135_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3,
+                             &vpx_highbd_d135_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3,
+                             &vpx_highbd_d135_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3,
+                             &vpx_highbd_d153_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3,
+                             &vpx_highbd_d153_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3,
+                             &vpx_highbd_d153_predictor_32x32_c, 32, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3,
+                             &vpx_highbd_d207_predictor_8x8_c, 8, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3,
+                             &vpx_highbd_d207_predictor_16x16_c, 16, 8),
+        HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3,
+                             &vpx_highbd_d207_predictor_32x32_c, 32, 8)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSSE3_TO_C_10, VP9HighbdIntraPredTest,
+    ::testing::Values(
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3,
+                             &vpx_highbd_d45_predictor_4x4_c, 4, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3,
+                             &vpx_highbd_d45_predictor_8x8_c, 8, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3,
+                             &vpx_highbd_d45_predictor_16x16_c, 16, 10),
+        HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3,
+                             &vpx_highbd_d45_predictor_32x32_c, 32, 10),
+        HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3,
+                             &vpx_highbd_d63_predictor_8x8_c, 8, 10),
+
HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d45_predictor_4x4_ssse3, + &vpx_highbd_d45_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_8x8_ssse3, + &vpx_highbd_d45_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_16x16_ssse3, + &vpx_highbd_d45_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_ssse3, + &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_ssse3, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_ssse3, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_c, + &vpx_highbd_d63_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_ssse3, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_ssse3, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_c, + &vpx_highbd_d117_predictor_32x32_ssse3, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_ssse3, + &vpx_highbd_d135_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_16x16_ssse3, + &vpx_highbd_d135_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_ssse3, + &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_ssse3, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_ssse3, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_ssse3, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + 
HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSSE3 + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2_TO_C_8, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -479,6 +633,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -487,6 +649,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -499,6 +669,32 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2_TO_C_10, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), + 
HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -507,6 +703,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -515,6 +719,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, @@ -527,6 +739,32 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( SSE2_TO_C_12, VP9HighbdIntraPredTest, ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_sse2, + &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_sse2, + &vpx_highbd_dc_128_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_16x16_sse2, + &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, + &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + 
HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_sse2, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_sse2, + &vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_sse2, + &vpx_highbd_d135_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_sse2, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, + &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, + &vpx_highbd_dc_left_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_16x16_sse2, + &vpx_highbd_dc_left_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_32x32_sse2, + &vpx_highbd_dc_left_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_4x4_sse2, &vpx_highbd_dc_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_8x8_sse2, @@ -535,6 +773,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_dc_predictor_32x32_sse2, &vpx_highbd_dc_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_4x4_sse2, + &vpx_highbd_dc_top_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_8x8_sse2, + &vpx_highbd_dc_top_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_16x16_sse2, + &vpx_highbd_dc_top_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_dc_top_predictor_32x32_sse2, + &vpx_highbd_dc_top_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_4x4_sse2, &vpx_highbd_tm_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_8x8_sse2, @@ -543,6 +789,14 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_tm_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_tm_predictor_32x32_sse2, &vpx_highbd_tm_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_4x4_sse2, + &vpx_highbd_h_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_8x8_sse2, + &vpx_highbd_h_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_16x16_sse2, + &vpx_highbd_h_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_h_predictor_32x32_sse2, + &vpx_highbd_h_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_4x4_sse2, &vpx_highbd_v_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_v_predictor_8x8_sse2, diff --git a/libvpx/test/vp9_quantize_test.cc b/libvpx/test/vp9_quantize_test.cc index 464389502..b18d4522c 100644 --- a/libvpx/test/vp9_quantize_test.cc +++ b/libvpx/test/vp9_quantize_test.cc @@ -14,9 +14,11 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" +#include "test/buffer.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" @@ -24,11 +26,12 @@ #include "vp9/common/vp9_scan.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; +using libvpx_test::Buffer; namespace { -#if CONFIG_VP9_HIGHBITDEPTH const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, @@ -38,307 +41,494 @@ typedef void (*QuantizeFunc)(const 
tran_low_t *coeff, intptr_t count, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, const int16_t *scan, const int16_t *iscan); -typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t> +typedef std::tr1::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, + int /*max_size*/, bool /*is_fp*/> QuantizeParam; -class VP9QuantizeTest : public ::testing::TestWithParam<QuantizeParam> { +// Wrapper for FP version which does not use zbin or quant_shift. +typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, + int skip_block, const int16_t *round, + const int16_t *quant, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, + uint16_t *eob, const int16_t *scan, + const int16_t *iscan); + +template <QuantizeFPFunc fn> +void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block, + const int16_t *zbin, const int16_t *round, + const int16_t *quant, const int16_t *quant_shift, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, const int16_t *scan, + const int16_t *iscan) { + (void)zbin; + (void)quant_shift; + + fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob, + scan, iscan); +} + +class VP9QuantizeBase { public: - virtual ~VP9QuantizeTest() {} - virtual void SetUp() { - quantize_op_ = GET_PARAM(0); - ref_quantize_op_ = GET_PARAM(1); - bit_depth_ = GET_PARAM(2); - mask_ = (1 << bit_depth_) - 1; + VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp) + : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) { + max_value_ = (1 << bit_depth_) - 1; + zbin_ptr_ = + reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); + round_fp_ptr_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); + quant_fp_ptr_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); + round_ptr_ = + reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_))); + quant_ptr_ = + reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); + quant_shift_ptr_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); + dequant_ptr_ = reinterpret_cast<int16_t *>( + vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + ~VP9QuantizeBase() { + vpx_free(zbin_ptr_); + vpx_free(round_fp_ptr_); + vpx_free(quant_fp_ptr_); + vpx_free(round_ptr_); + vpx_free(quant_ptr_); + vpx_free(quant_shift_ptr_); + vpx_free(dequant_ptr_); + zbin_ptr_ = NULL; + round_fp_ptr_ = NULL; + quant_fp_ptr_ = NULL; + round_ptr_ = NULL; + quant_ptr_ = NULL; + quant_shift_ptr_ = NULL; + dequant_ptr_ = NULL; + libvpx_test::ClearSystemState(); + } protected: - vpx_bit_depth_t bit_depth_; - int mask_; - QuantizeFunc quantize_op_; - QuantizeFunc ref_quantize_op_; + int16_t *zbin_ptr_; + int16_t *round_fp_ptr_; + int16_t *quant_fp_ptr_; + int16_t *round_ptr_; + int16_t *quant_ptr_; + int16_t *quant_shift_ptr_; + int16_t *dequant_ptr_; + const vpx_bit_depth_t bit_depth_; + int max_value_; + const int max_size_; + const bool is_fp_; }; -class VP9Quantize32Test : public ::testing::TestWithParam<QuantizeParam> { +class VP9QuantizeTest : public VP9QuantizeBase, + public ::testing::TestWithParam<QuantizeParam> { public: - virtual ~VP9Quantize32Test() {} - virtual void SetUp() { - quantize_op_ = GET_PARAM(0); - ref_quantize_op_ = GET_PARAM(1); - bit_depth_ = GET_PARAM(2); - mask_ = (1 << bit_depth_) - 1; - } - - virtual void TearDown() { libvpx_test::ClearSystemState(); } + 
VP9QuantizeTest() + : VP9QuantizeBase(GET_PARAM(2), GET_PARAM(3), GET_PARAM(4)), + quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: - vpx_bit_depth_t bit_depth_; - int mask_; - QuantizeFunc quantize_op_; - QuantizeFunc ref_quantize_op_; + const QuantizeFunc quantize_op_; + const QuantizeFunc ref_quantize_op_; }; -TEST_P(VP9QuantizeTest, OperationCheck) { - ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - const TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); +// This quantizer compares the AC coefficients to the quantization step size to +// determine if further multiplication operations are needed. +// Based on vp9_quantize_fp_sse2(). +void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int i, eob = -1; + const int thr = dequant_ptr[1] >> 1; + (void)iscan; + (void)skip_block; + assert(!skip_block); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i += 16) { + int y; + int nzflag_cnt = 0; + int abs_coeff[16]; + int coeff_sign[16]; + + // count nzflag for each row (16 tran_low_t) + for (y = 0; y < 16; ++y) { + const int rc = i + y; + const int coeff = coeff_ptr[rc]; + coeff_sign[y] = (coeff >> 31); + abs_coeff[y] = (coeff ^ coeff_sign[y]) - coeff_sign[y]; + // The first 16 are skipped in the sse2 code. Do the same here to match. 
+ if (i >= 16 && (abs_coeff[y] <= thr)) { + nzflag_cnt++; + } } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); + + for (y = 0; y < 16; ++y) { + const int rc = i + y; + // If all of the AC coeffs in a row has magnitude less than the + // quantization step_size/2, quantize to zero. + if (nzflag_cnt < 16) { + int tmp = + clamp(abs_coeff[y] + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign[y]) - coeff_sign[y]; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + } else { + qcoeff_ptr[rc] = 0; + dqcoeff_ptr[rc] = 0; + } } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; + } + + // Scan for eob. + for (i = 0; i < n_coeffs; i++) { + // Use the scan order to find the correct eob. + const int rc = scan[i]; + if (qcoeff_ptr[rc]) { + eob = i; } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; + *eob_ptr = eob + 1; +} + +void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, + int16_t *quant, int16_t *quant_shift, + int16_t *dequant, int16_t *round_fp, + int16_t *quant_fp) { + // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V. + const int max_qrounding_factor_fp = 64; + + for (int j = 0; j < 2; j++) { + // The range is 4 to 1828 in the VP9 tables. + const int qlookup = rnd->RandRange(1825) + 4; + round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7; + quant_fp[j] = (1 << 16) / qlookup; + + // Values determined by deconstructing vp9_init_quantizer(). + // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y + // values or U/V values of any bit depth. This is because y_delta is not + // factored into the vp9_ac_quant() call. + zbin[j] = rnd->RandRange(1200); + + // round may be up to 685 for Y values or 914 for U/V. + round[j] = rnd->RandRange(914); + // quant ranges from 1 to -32703 + quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703; + // quant_shift goes up to 1 << 16. + quant_shift[j] = rnd->RandRange(16384); + // dequant maxes out at 1828 for all cases. 
+ dequant[j] = rnd->RandRange(1828); + } + for (int j = 2; j < 8; j++) { + zbin[j] = zbin[1]; + round_fp[j] = round_fp[1]; + quant_fp[j] = quant_fp[1]; + round[j] = round[1]; + quant[j] = quant[1]; + quant_shift[j] = quant_shift[1]; + dequant[j] = dequant[1]; + } } -TEST_P(VP9Quantize32Test, OperationCheck) { +TEST_P(VP9QuantizeTest, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; + Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + Buffer<tran_low_t> ref_qcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer<tran_low_t> ref_dqcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t eob, ref_eob; + for (int i = 0; i < number_of_iterations; ++i) { - const int skip_block = i == 0; - const TX_SIZE sz = TX_32X32; - const TX_TYPE tx_type = (TX_TYPE)(i % 4); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - const int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = rnd.Rand16() & mask_; - } - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); + // Test skip block for the first three iterations to catch all the different + // sizes. + const int skip_block = 0; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast<TX_SIZE>(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; } - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, + const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3); + const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; + const int count = (4 << sz) * (4 << sz); + coeff.Set(&rnd, -max_value_, max_value_); + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; + ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_order->scan, scan_order->iscan); + ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; + coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); + + EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff.PrintDifference(ref_qcoeff); + dqcoeff.PrintDifference(ref_dqcoeff); + return; } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } TEST_P(VP9QuantizeTest, EOBCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[256]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[256]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; + Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + Buffer<tran_low_t> ref_qcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_qcoeff.Init()); + Buffer<tran_low_t> ref_dqcoeff = + Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(ref_dqcoeff.Init()); + uint16_t eob, ref_eob; + for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = (TX_SIZE)(i % 3); // TX_4X4, TX_8X8 TX_16X16 - TX_TYPE tx_type = (TX_TYPE)((i >> 2) % 3); + const int skip_block = 0; + TX_SIZE sz; + if (max_size_ == 16) { + sz = static_cast<TX_SIZE>(i % 3); // TX_4X4, TX_8X8 TX_16X16 + } else { + sz = TX_32X32; + } + const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3); const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 16, 64, 256 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; + int count = (4 << sz) * (4 << sz); // Two random entries - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] 
= rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } - - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, + coeff.Set(0); + coeff.TopLeftPixel()[rnd(count)] = + static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; + coeff.TopLeftPixel()[rnd(count)] = + static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_; + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, scan_order->scan, scan_order->iscan); + ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); + coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr, + quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(), + dequant_ptr_, &eob, scan_order->scan, scan_order->iscan)); - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; + EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff)); + EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff)); + + EXPECT_EQ(eob, ref_eob); + + if (HasFailure()) { + printf("Failure on iteration %d.\n", i); + qcoeff.PrintDifference(ref_qcoeff); + dqcoeff.PrintDifference(ref_dqcoeff); + return; } - err_count_total += err_count; } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. 
" - << "First failed at test case " << first_failure; } -TEST_P(VP9Quantize32Test, EOBCheck) { +TEST_P(VP9QuantizeTest, DISABLED_Speed) { ACMRandom rnd(ACMRandom::DeterministicSeed()); - DECLARE_ALIGNED(16, tran_low_t, coeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, zbin_ptr[2]); - DECLARE_ALIGNED(16, int16_t, round_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_ptr[2]); - DECLARE_ALIGNED(16, int16_t, quant_shift_ptr[2]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_qcoeff_ptr[1024]); - DECLARE_ALIGNED(16, tran_low_t, ref_dqcoeff_ptr[1024]); - DECLARE_ALIGNED(16, int16_t, dequant_ptr[2]); - DECLARE_ALIGNED(16, uint16_t, eob_ptr[1]); - DECLARE_ALIGNED(16, uint16_t, ref_eob_ptr[1]); - int err_count_total = 0; - int first_failure = -1; - for (int i = 0; i < number_of_iterations; ++i) { - int skip_block = i == 0; - TX_SIZE sz = TX_32X32; - TX_TYPE tx_type = (TX_TYPE)(i % 4); - const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; - int count = (4 << sz) * (4 << sz); // 1024 - int err_count = 0; - *eob_ptr = rnd.Rand16(); - *ref_eob_ptr = *eob_ptr; - for (int j = 0; j < count; j++) { - coeff_ptr[j] = 0; - } - // Two random entries - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - coeff_ptr[rnd(count)] = rnd.Rand16() & mask_; - for (int j = 0; j < 2; j++) { - zbin_ptr[j] = rnd.Rand16() & mask_; - round_ptr[j] = rnd.Rand16(); - quant_ptr[j] = rnd.Rand16(); - quant_shift_ptr[j] = rnd.Rand16(); - dequant_ptr[j] = rnd.Rand16(); - } + Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16); + ASSERT_TRUE(coeff.Init()); + Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(qcoeff.Init()); + Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32); + ASSERT_TRUE(dqcoeff.Init()); + uint16_t eob; + TX_SIZE starting_sz, ending_sz; - ref_quantize_op_(coeff_ptr, count, skip_block, zbin_ptr, round_ptr, - quant_ptr, quant_shift_ptr, ref_qcoeff_ptr, - ref_dqcoeff_ptr, dequant_ptr, ref_eob_ptr, - scan_order->scan, scan_order->iscan); - ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_ptr, count, skip_block, zbin_ptr, round_ptr, quant_ptr, - quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, - scan_order->scan, scan_order->iscan)); + if (max_size_ == 16) { + starting_sz = TX_4X4; + ending_sz = TX_16X16; + } else { + starting_sz = TX_32X32; + ending_sz = TX_32X32; + } - for (int j = 0; j < sz; ++j) { - err_count += (ref_qcoeff_ptr[j] != qcoeff_ptr[j]) | - (ref_dqcoeff_ptr[j] != dqcoeff_ptr[j]); - } - err_count += (*ref_eob_ptr != *eob_ptr); - if (err_count && !err_count_total) { - first_failure = i; + for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) { + // zbin > coeff, zbin < coeff. + for (int i = 0; i < 2; ++i) { + const int skip_block = 0; + // TX_TYPE defines the scan order. That is not relevant to the speed test. + // Pick the first one. + const TX_TYPE tx_type = DCT_DCT; + const scan_order *scan_order = &vp9_scan_orders[sz][tx_type]; + const int count = (4 << sz) * (4 << sz); + + GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, + quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, + quant_fp_ptr_); + int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_; + int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_; + + if (i == 0) { + // When |coeff values| are less than zbin the results are 0. + int threshold = 100; + if (max_size_ == 32) { + // For 32x32, the threshold is halved. 
Double it to keep the values + // from clearing it. + threshold = 200; + } + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold; + coeff.Set(&rnd, -99, 99); + } else if (i == 1) { + for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50; + coeff.Set(&rnd, -500, 500); + } + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int j = 0; j < 100000000 / count; ++j) { + quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, + q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(), + dqcoeff.TopLeftPixel(), dequant_ptr_, &eob, + scan_order->scan, scan_order->iscan); + } + vpx_usec_timer_mark(&timer); + const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer)); + if (i == 0) printf("Bypass calculations.\n"); + if (i == 1) printf("Full calculations.\n"); + printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz, + elapsed_time / 1000); } - err_count_total += err_count; + printf("\n"); } - EXPECT_EQ(0, err_count_total) - << "Error: Quantization Test, C output doesn't match SSE2 output. " - << "First failed at test case " << first_failure; } + using std::tr1::make_tuple; #if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +// TODO(johannkoenig): Fix vpx_quantize_b_sse2 in highbitdepth builds. +// make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c, VPX_BITS_8), INSTANTIATE_TEST_CASE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_sse2, - &vpx_highbd_quantize_b_c, VPX_BITS_12))); + ::testing::Values( + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_10, 16, false), + make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + VPX_BITS_12, 16, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), + make_tuple(&vpx_highbd_quantize_b_32x32_sse2, + &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + +#else INSTANTIATE_TEST_CASE_P( - SSE2, VP9Quantize32Test, - ::testing::Values(make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12))); -#endif // HAVE_SSE2 + SSE2, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, + 16, true))); #endif // CONFIG_VP9_HIGHBITDEPTH +#endif // HAVE_SSE2 + +#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P( + SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, + 16, true))); +#else +INSTANTIATE_TEST_CASE_P(SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); +#endif + +#if ARCH_X86_64 +// TODO(johannkoenig): SSSE3 optimizations do not yet pass this test. 
+INSTANTIATE_TEST_CASE_P( + DISABLED_SSSE3, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_32x32_ssse3, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>, + &QuantFPWrapper<vp9_quantize_fp_32x32_c>, + VPX_BITS_8, 32, true))); +#endif // ARCH_X86_64 +#endif // HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH + +// TODO(johannkoenig): AVX optimizations do not yet pass the 32x32 test or +// highbitdepth configurations. +#if HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + // Even though SSSE3 and AVX do not match the reference + // code, we can keep them in sync with each other. + make_tuple(&vpx_quantize_b_32x32_avx, + &vpx_quantize_b_32x32_ssse3, VPX_BITS_8, 32, + false))); +#endif // HAVE_AVX && !CONFIG_VP9_HIGHBITDEPTH + +// TODO(webm:1448): dqcoeff is not handled correctly in HBD builds. +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P( + NEON, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_neon, + &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, + false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, + &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, + 16, true), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, + &QuantFPWrapper<vp9_quantize_fp_32x32_c>, + VPX_BITS_8, 32, true))); +#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH + +// Only useful to compare "Speed" test results. +INSTANTIATE_TEST_CASE_P( + DISABLED_C, VP9QuantizeTest, + ::testing::Values( + make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), + make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, + 32, false), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>, + &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper<quantize_fp_nz_c>, + &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), + make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_c>, + &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, + true))); } // namespace diff --git a/libvpx/test/vp9_scale_test.cc b/libvpx/test/vp9_scale_test.cc new file mode 100644 index 000000000..5d7d38e89 --- /dev/null +++ b/libvpx/test/vp9_scale_test.cc @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <assert.h> +#include <stdio.h> +#include <string.h> + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "test/vpx_scale_test.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" +#include "vpx_scale/yv12config.h" + +namespace libvpx_test { + +typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, int phase_scaler); + +class ScaleTest : public VpxScaleBase, + public ::testing::TestWithParam<ScaleFrameFunc> { + public: + virtual ~ScaleTest() {} + + protected: + virtual void SetUp() { scale_fn_ = GetParam(); } + + void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); + } + + void ScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { + ASM_REGISTER_STATE_CHECK( + scale_fn_(&img_, &dst_img_, filter_type, phase_scaler)); + } + + void RunTest() { + static const int kNumSizesToTest = 20; + static const int kNumScaleFactorsToTest = 4; + static const int kSizesToTest[] = { + 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 68, 128, 134 + }; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 16; ++phase_scaler) { + for (int h = 0; h < kNumSizesToTest; ++h) { + const int src_height = kSizesToTest[h]; + for (int w = 0; w < kNumSizesToTest; ++w) { + const int src_width = kSizesToTest[w]; + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; + ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. + if (!dst_width || !dst_height || dst_width & 1 || + dst_height & 1) { + continue; + } + // vpx_convolve8_c() has restriction on the step which cannot + // exceed 64 (ratio 1 to 4). 
+ if (src_width > 4 * dst_width || src_height > 4 * dst_height) { + continue; + } + ASSERT_NO_FATAL_FAILURE(ResetScaleImages( + src_width, src_height, dst_width, dst_height)); + ReferenceScaleFrame(filter_type, phase_scaler); + ScaleFrame(filter_type, phase_scaler); + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d\n", + filter_type, phase_scaler, src_width, src_height, + dst_width, dst_height, sf_down, sf_up); + PrintDiff(); + } + CompareImages(dst_img_); + DeallocScaleImages(); + } + } + } + } + } + } + } + + void PrintDiffComponent(const uint8_t *const ref, const uint8_t *const opt, + const int stride, const int width, const int height, + const int plane_idx) const { + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + if (ref[y * stride + x] != opt[y * stride + x]) { + printf("Plane %d pixel[%d][%d] diff:%6d (ref),%6d (opt)\n", plane_idx, + y, x, ref[y * stride + x], opt[y * stride + x]); + break; + } + } + } + } + + void PrintDiff() const { + assert(ref_img_.y_stride == dst_img_.y_stride); + assert(ref_img_.y_width == dst_img_.y_width); + assert(ref_img_.y_height == dst_img_.y_height); + assert(ref_img_.uv_stride == dst_img_.uv_stride); + assert(ref_img_.uv_width == dst_img_.uv_width); + assert(ref_img_.uv_height == dst_img_.uv_height); + + if (memcmp(dst_img_.buffer_alloc, ref_img_.buffer_alloc, + ref_img_.frame_size)) { + PrintDiffComponent(ref_img_.y_buffer, dst_img_.y_buffer, + ref_img_.y_stride, ref_img_.y_width, ref_img_.y_height, + 0); + PrintDiffComponent(ref_img_.u_buffer, dst_img_.u_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 1); + PrintDiffComponent(ref_img_.v_buffer, dst_img_.v_buffer, + ref_img_.uv_stride, ref_img_.uv_width, + ref_img_.uv_height, 2); + } + } + + ScaleFrameFunc scale_fn_; +}; + +TEST_P(ScaleTest, ScaleFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } + +TEST_P(ScaleTest, DISABLED_Speed) { + static const int kCountSpeedTestBlock = 100; + static const int kNumScaleFactorsToTest = 4; + static const int kScaleFactors[] = { 1, 2, 3, 4 }; + const int src_width = 1280; + const int src_height = 720; + for (INTERP_FILTER filter_type = 2; filter_type < 4; ++filter_type) { + for (int phase_scaler = 0; phase_scaler < 2; ++phase_scaler) { + for (int sf_up_idx = 0; sf_up_idx < kNumScaleFactorsToTest; ++sf_up_idx) { + const int sf_up = kScaleFactors[sf_up_idx]; + for (int sf_down_idx = 0; sf_down_idx < kNumScaleFactorsToTest; + ++sf_down_idx) { + const int sf_down = kScaleFactors[sf_down_idx]; + const int dst_width = src_width * sf_up / sf_down; + const int dst_height = src_height * sf_up / sf_down; + if (sf_up == sf_down && sf_up != 1) { + continue; + } + // I420 frame width and height must be even. 
+ if (dst_width & 1 || dst_height & 1) { + continue; + } + ASSERT_NO_FATAL_FAILURE( + ResetScaleImages(src_width, src_height, dst_width, dst_height)); + ASM_REGISTER_STATE_CHECK( + ReferenceScaleFrame(filter_type, phase_scaler)); + + vpx_usec_timer timer; + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + ScaleFrame(filter_type, phase_scaler); + } + libvpx_test::ClearSystemState(); + vpx_usec_timer_mark(&timer); + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000); + CompareImages(dst_img_); + DeallocScaleImages(); + + printf( + "filter_type = %d, phase_scaler = %d, src_width = %4d, " + "src_height = %4d, dst_width = %4d, dst_height = %4d, " + "scale factor = %d:%d, scale time: %5d ms\n", + filter_type, phase_scaler, src_width, src_height, dst_width, + dst_height, sf_down, sf_up, elapsed_time); + } + } + } + } +} + +INSTANTIATE_TEST_CASE_P(C, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_c)); + +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P(SSSE3, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_ssse3)); +#endif // HAVE_SSSE3 + +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, ScaleTest, + ::testing::Values(vp9_scale_and_extend_frame_neon)); +#endif // HAVE_NEON + +} // namespace libvpx_test diff --git a/libvpx/test/vp9_skip_loopfilter_test.cc b/libvpx/test/vp9_skip_loopfilter_test.cc index e847bbddf..d41a784a2 100644 --- a/libvpx/test/vp9_skip_loopfilter_test.cc +++ b/libvpx/test/vp9_skip_loopfilter_test.cc @@ -85,8 +85,8 @@ class SkipLoopFilterTest { // TODO(fgalligan): Move the MD5 testing code into another class. void OpenMd5File(const std::string &md5_file_name) { md5_file_ = libvpx_test::OpenTestDataFile(md5_file_name); - ASSERT_TRUE(md5_file_ != NULL) << "MD5 file open failed. Filename: " - << md5_file_name; + ASSERT_TRUE(md5_file_ != NULL) + << "MD5 file open failed. Filename: " << md5_file_name; } // Reads the next line of the MD5 file. 
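Both of the new DISABLED_Speed tests above (in vp9_quantize_test.cc and vp9_scale_test.cc) rely on the same vpx_ports/vpx_timer.h pattern: start the timer, run the kernel in a tight loop, mark the timer, then read the elapsed microseconds. The following is a minimal sketch of that pattern only; kernel_under_test and the iteration count are illustrative placeholders, not part of libvpx or of these patches.

#include <cstdio>
#include "vpx_ports/vpx_timer.h"

// Hypothetical stand-in for the routine being measured, e.g. quantize_op_
// in vp9_quantize_test.cc or ScaleFrame() in vp9_scale_test.cc.
static void kernel_under_test() {}

static void TimeKernel(int iterations) {
  vpx_usec_timer timer;
  vpx_usec_timer_start(&timer);
  for (int i = 0; i < iterations; ++i) {
    kernel_under_test();
  }
  vpx_usec_timer_mark(&timer);  // Stop the clock before reading it.
  const int elapsed_us = static_cast<int>(vpx_usec_timer_elapsed(&timer));
  printf("%d iterations: %5d ms\n", iterations, elapsed_us / 1000);
}

Note the difference in how the two tests size their loops: the quantize speed test scales the trip count by transform size (100000000 / count) so each block size does a comparable amount of work per measurement, while the scale speed test runs a fixed kCountSpeedTestBlock of 100 frames per configuration.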
diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc index 19ed30431..62845ad61 100644 --- a/libvpx/test/vp9_subtract_test.cc +++ b/libvpx/test/vp9_subtract_test.cc @@ -101,4 +101,9 @@ INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest, ::testing::Values(vpx_subtract_block_msa)); #endif +#if HAVE_MMI +INSTANTIATE_TEST_CASE_P(MMI, VP9SubtractBlockTest, + ::testing::Values(vpx_subtract_block_mmi)); +#endif + } // namespace vp9 diff --git a/libvpx/test/vp9_thread_test.cc b/libvpx/test/vp9_thread_test.cc index 3e3fd25ac..576f5e906 100644 --- a/libvpx/test/vp9_thread_test.cc +++ b/libvpx/test/vp9_thread_test.cc @@ -187,8 +187,8 @@ void DecodeFiles(const FileList files[]) { for (const FileList *iter = files; iter->name != NULL; ++iter) { SCOPED_TRACE(iter->name); for (int t = 1; t <= 8; ++t) { - EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) << "threads = " - << t; + EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t)) + << "threads = " << t; } } } diff --git a/libvpx/test/vpx_scale_test.cc b/libvpx/test/vpx_scale_test.cc index 9701d93da..ac75dceb2 100644 --- a/libvpx/test/vpx_scale_test.cc +++ b/libvpx/test/vpx_scale_test.cc @@ -14,149 +14,17 @@ #include "./vpx_scale_rtcd.h" #include "test/clear_system_state.h" #include "test/register_state_check.h" +#include "test/vpx_scale_test.h" #include "vpx_mem/vpx_mem.h" +#include "vpx_ports/vpx_timer.h" #include "vpx_scale/yv12config.h" -namespace { +namespace libvpx_test { typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, YV12_BUFFER_CONFIG *dst_ybf); -class VpxScaleBase { - public: - virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } - - void ResetImage(int width, int height) { - width_ = width; - height_ = height; - memset(&img_, 0, sizeof(img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_, - VP8BORDERINPIXELS)); - memset(img_.buffer_alloc, kBufFiller, img_.frame_size); - FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, - img_.y_stride); - FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, - img_.uv_stride); - - memset(&ref_img_, 0, sizeof(ref_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size); - - memset(&cpy_img_, 0, sizeof(cpy_img_)); - ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_, - VP8BORDERINPIXELS)); - memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size); - ReferenceCopyFrame(); - } - - void DeallocImage() { - vp8_yv12_de_alloc_frame_buffer(&img_); - vp8_yv12_de_alloc_frame_buffer(&ref_img_); - vp8_yv12_de_alloc_frame_buffer(&cpy_img_); - } - - protected: - static const int kBufFiller = 123; - static const int kBufMax = kBufFiller - 1; - - static void FillPlane(uint8_t *buf, int width, int height, int stride) { - for (int y = 0; y < height; ++y) { - for (int x = 0; x < width; ++x) { - buf[x + (y * stride)] = (x + (width * y)) % kBufMax; - } - } - } - - static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, - int width, int height, int stride, int padding) { - // Copy the outermost visible pixel to a distance of at least 'padding.' - // The buffers are allocated such that there may be excess space outside the - // padding. 
As long as the minimum amount of padding is achieved it is not - // necessary to fill this space as well. - uint8_t *left = buf - padding; - uint8_t *right = buf + crop_width; - const int right_extend = padding + (width - crop_width); - const int bottom_extend = padding + (height - crop_height); - - // Fill the border pixels from the nearest image pixel. - for (int y = 0; y < crop_height; ++y) { - memset(left, left[padding], padding); - memset(right, right[-1], right_extend); - left += stride; - right += stride; - } - - left = buf - padding; - uint8_t *top = left - (stride * padding); - // The buffer does not always extend as far as the stride. - // Equivalent to padding + width + padding. - const int extend_width = padding + crop_width + right_extend; - - // The first row was already extended to the left and right. Copy it up. - for (int y = 0; y < padding; ++y) { - memcpy(top, left, extend_width); - top += stride; - } - - uint8_t *bottom = left + (crop_height * stride); - for (int y = 0; y < bottom_extend; ++y) { - memcpy(bottom, left + (crop_height - 1) * stride, extend_width); - bottom += stride; - } - } - - void ReferenceExtendBorder() { - ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, - ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, - ref_img_.y_stride, ref_img_.border); - ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, - ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, ref_img_.border / 2); - } - - void ReferenceCopyFrame() { - // Copy img_ to ref_img_ and extend frame borders. This will be used for - // verifying extend_fn_ as well as copy_frame_fn_. 
- EXPECT_EQ(ref_img_.frame_size, img_.frame_size); - for (int y = 0; y < img_.y_crop_height; ++y) { - for (int x = 0; x < img_.y_crop_width; ++x) { - ref_img_.y_buffer[x + y * ref_img_.y_stride] = - img_.y_buffer[x + y * img_.y_stride]; - } - } - - for (int y = 0; y < img_.uv_crop_height; ++y) { - for (int x = 0; x < img_.uv_crop_width; ++x) { - ref_img_.u_buffer[x + y * ref_img_.uv_stride] = - img_.u_buffer[x + y * img_.uv_stride]; - ref_img_.v_buffer[x + y * ref_img_.uv_stride] = - img_.v_buffer[x + y * img_.uv_stride]; - } - } - - ReferenceExtendBorder(); - } - - void CompareImages(const YV12_BUFFER_CONFIG actual) { - EXPECT_EQ(ref_img_.frame_size, actual.frame_size); - EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, - ref_img_.frame_size)); - } - - YV12_BUFFER_CONFIG img_; - YV12_BUFFER_CONFIG ref_img_; - YV12_BUFFER_CONFIG cpy_img_; - int width_; - int height_; -}; - class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam<ExtendFrameBorderFunc> { @@ -178,11 +46,11 @@ class ExtendBorderTest static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h])); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); + ReferenceCopyFrame(); ExtendBorder(); - ReferenceExtendBorder(); CompareImages(img_); - DeallocImage(); + DeallocImages(); } } } @@ -204,7 +72,7 @@ class CopyFrameTest : public VpxScaleBase, virtual void SetUp() { copy_frame_fn_ = GetParam(); } void CopyFrame() { - ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_)); + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); } void RunTest() { @@ -217,11 +85,11 @@ class CopyFrameTest : public VpxScaleBase, static const int kSizesToTest[] = { 1, 15, 33, 145, 512, 1025, 16383 }; for (int h = 0; h < kNumSizesToTest; ++h) { for (int w = 0; w < kNumSizesToTest; ++w) { - ASSERT_NO_FATAL_FAILURE(ResetImage(kSizesToTest[w], kSizesToTest[h])); + ASSERT_NO_FATAL_FAILURE(ResetImages(kSizesToTest[w], kSizesToTest[h])); ReferenceCopyFrame(); CopyFrame(); - CompareImages(cpy_img_); - DeallocImage(); + CompareImages(dst_img_); + DeallocImages(); } } } @@ -233,4 +101,5 @@ TEST_P(CopyFrameTest, CopyFrame) { ASSERT_NO_FATAL_FAILURE(RunTest()); } INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, ::testing::Values(vp8_yv12_copy_frame_c)); -} // namespace + +} // namespace libvpx_test diff --git a/libvpx/test/vpx_scale_test.h b/libvpx/test/vpx_scale_test.h new file mode 100644 index 000000000..dcbd02b91 --- /dev/null +++ b/libvpx/test/vpx_scale_test.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef TEST_VPX_SCALE_TEST_H_ +#define TEST_VPX_SCALE_TEST_H_ + +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vpx_scale_rtcd.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_scale/yv12config.h" + +using libvpx_test::ACMRandom; + +namespace libvpx_test { + +class VpxScaleBase { + public: + virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } + + void ResetImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); + ASSERT_EQ( + 0, vp8_yv12_alloc_frame_buffer(img, width, height, VP8BORDERINPIXELS)); + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetImages(const int width, const int height) { + ResetImage(&img_, width, height); + ResetImage(&ref_img_, width, height); + ResetImage(&dst_img_, width, height); + + FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void ResetScaleImage(YV12_BUFFER_CONFIG *const img, const int width, + const int height) { + memset(img, 0, sizeof(*img)); +#if CONFIG_VP9_HIGHBITDEPTH + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, 0, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#else + ASSERT_EQ(0, vpx_alloc_frame_buffer(img, width, height, 1, 1, + VP9_ENC_BORDER_IN_PIXELS, 0)); +#endif + memset(img->buffer_alloc, kBufFiller, img->frame_size); + } + + void ResetScaleImages(const int src_width, const int src_height, + const int dst_width, const int dst_height) { + ResetScaleImage(&img_, src_width, src_height); + ResetScaleImage(&ref_img_, dst_width, dst_height); + ResetScaleImage(&dst_img_, dst_width, dst_height); + FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, + img_.y_stride); + FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height, + img_.uv_stride); + } + + void DeallocImages() { + vp8_yv12_de_alloc_frame_buffer(&img_); + vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&dst_img_); + } + + void DeallocScaleImages() { + vpx_free_frame_buffer(&img_); + vpx_free_frame_buffer(&ref_img_); + vpx_free_frame_buffer(&dst_img_); + } + + protected: + static const int kBufFiller = 123; + static const int kBufMax = kBufFiller - 1; + + static void FillPlane(uint8_t *const buf, const int width, const int height, + const int stride) { + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = (x + (width * y)) % kBufMax; + } + } + } + + static void FillPlaneExtreme(uint8_t *const buf, const int width, + const int height, const int stride) { + ACMRandom rnd; + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; ++x) { + buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0; + } + } + } + + static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, + int width, int height, int stride, int padding) { + // Copy the outermost visible pixel to a distance of at least 'padding.' + // The buffers are allocated such that there may be excess space outside the + // padding. As long as the minimum amount of padding is achieved it is not + // necessary to fill this space as well. 
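+    // Worked example with hypothetical numbers (not taken from any caller):
+    // for crop_width = 5, width = 6, padding = 2 and stride = 12, a fully
+    // extended row spans extend_width = padding + crop_width + right_extend
+    // = 10 bytes: 2 left-pad bytes copied from the first visible pixel, the
+    // 5 visible pixels, then right_extend = padding + (width - crop_width)
+    // = 3 bytes copied from the last visible pixel. The remaining 2 bytes of
+    // the stride are the excess space that may legitimately stay unfilled.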
+ uint8_t *left = buf - padding; + uint8_t *right = buf + crop_width; + const int right_extend = padding + (width - crop_width); + const int bottom_extend = padding + (height - crop_height); + + // Fill the border pixels from the nearest image pixel. + for (int y = 0; y < crop_height; ++y) { + memset(left, left[padding], padding); + memset(right, right[-1], right_extend); + left += stride; + right += stride; + } + + left = buf - padding; + uint8_t *top = left - (stride * padding); + // The buffer does not always extend as far as the stride. + // Equivalent to padding + width + padding. + const int extend_width = padding + crop_width + right_extend; + + // The first row was already extended to the left and right. Copy it up. + for (int y = 0; y < padding; ++y) { + memcpy(top, left, extend_width); + top += stride; + } + + uint8_t *bottom = left + (crop_height * stride); + for (int y = 0; y < bottom_extend; ++y) { + memcpy(bottom, left + (crop_height - 1) * stride, extend_width); + bottom += stride; + } + } + + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, ref_img_.y_crop_width, + ref_img_.y_crop_height, ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, ref_img_.border); + ExtendPlane(ref_img_.u_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, ref_img_.uv_crop_width, + ref_img_.uv_crop_height, ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, ref_img_.border / 2); + } + + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. + EXPECT_EQ(ref_img_.frame_size, img_.frame_size); + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, + ref_img_.frame_size)); + } + + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG dst_img_; +}; + +} // namespace libvpx_test + +#endif // TEST_VPX_SCALE_TEST_H_ diff --git a/libvpx/test/vpx_temporal_svc_encoder.sh b/libvpx/test/vpx_temporal_svc_encoder.sh index 3d5152ae3..56a7902f4 100755 --- a/libvpx/test/vpx_temporal_svc_encoder.sh +++ b/libvpx/test/vpx_temporal_svc_encoder.sh @@ -52,11 +52,19 @@ vpx_tsvc_encoder() { # TODO(tomfinegan): Verify file output for all thread runs. 
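  # A sketch of the intent of the branch below, so far as this diff shows it:
  # plain builds invoke the example encoder with the original argument list,
  # while CONFIG_VP9_HIGHBITDEPTH builds append a single extra trailing
  # argument, "8", which presumably tells the high-bitdepth-capable binary to
  # treat the raw input as 8-bit.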
for threads in $(seq $max_threads); do - eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" "${output_file}" \ - "${codec}" "${YUV_RAW_INPUT_WIDTH}" "${YUV_RAW_INPUT_HEIGHT}" \ - "${timebase_num}" "${timebase_den}" "${speed}" "${frame_drop_thresh}" \ - "${error_resilient}" "${threads}" "$@" \ - ${devnull} + if [ "$(vpx_config_option_enabled CONFIG_VP9_HIGHBITDEPTH)" != "yes" ]; then + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" ${devnull} + else + eval "${VPX_TEST_PREFIX}" "${encoder}" "${YUV_RAW_INPUT}" \ + "${output_file}" "${codec}" "${YUV_RAW_INPUT_WIDTH}" \ + "${YUV_RAW_INPUT_HEIGHT}" "${timebase_num}" "${timebase_den}" \ + "${speed}" "${frame_drop_thresh}" "${error_resilient}" "${threads}" \ + "$@" "8" ${devnull} + fi done } diff --git a/libvpx/test/vpxenc.sh b/libvpx/test/vpxenc.sh index e8994992a..0c160dafc 100755 --- a/libvpx/test/vpxenc.sh +++ b/libvpx/test/vpxenc.sh @@ -90,6 +90,15 @@ vpxenc_rt_params() { --undershoot-pct=50" } +# Forces --passes to 1 with CONFIG_REALTIME_ONLY. +vpxenc_passes_param() { + if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" = "yes" ]; then + echo "--passes=1" + else + echo "--passes=2" + fi +} + # Wrapper function for running vpxenc with pipe input. Requires that # LIBVPX_BIN_PATH points to the directory containing vpxenc. $1 is used as the # input file path and shifted away. All remaining parameters are passed through @@ -218,9 +227,11 @@ vpxenc_vp8_ivf_piped_input() { vpxenc_vp9_ivf() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --ivf \ --output="${output}" @@ -235,9 +246,11 @@ vpxenc_vp9_webm() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --output="${output}" if [ ! -e "${output}" ]; then @@ -339,11 +352,13 @@ vpxenc_vp9_webm_2pass() { vpxenc_vp9_ivf_lossless() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --lossless=1 if [ ! 
-e "${output}" ]; then @@ -356,11 +371,13 @@ vpxenc_vp9_ivf_lossless() { vpxenc_vp9_ivf_minq0_maxq0() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lossless_minq0_maxq0.ivf" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ --ivf \ --output="${output}" \ + "${passes}" \ --min-q=0 \ --max-q=0 @@ -377,12 +394,13 @@ vpxenc_vp9_webm_lag10_frames20() { local readonly lag_total_frames=20 local readonly lag_frames=10 local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_lag10_frames20.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(yuv_input_hantro_collage) \ --codec=vp9 \ --limit="${lag_total_frames}" \ --lag-in-frames="${lag_frames}" \ --output="${output}" \ - --passes=2 \ + "${passes}" \ --auto-alt-ref=1 if [ ! -e "${output}" ]; then @@ -397,9 +415,11 @@ vpxenc_vp9_webm_non_square_par() { if [ "$(vpxenc_can_encode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then local readonly output="${VPX_TEST_OUTPUT_DIR}/vp9_non_square_par.webm" + local readonly passes=$(vpxenc_passes_param) vpxenc $(y4m_input_non_square_par) \ --codec=vp9 \ --limit="${TEST_FRAMES}" \ + "${passes}" \ --output="${output}" if [ ! -e "${output}" ]; then @@ -412,18 +432,21 @@ vpxenc_vp9_webm_non_square_par() { vpxenc_tests="vpxenc_vp8_ivf vpxenc_vp8_webm vpxenc_vp8_webm_rt - vpxenc_vp8_webm_2pass - vpxenc_vp8_webm_lag10_frames20 vpxenc_vp8_ivf_piped_input vpxenc_vp9_ivf vpxenc_vp9_webm vpxenc_vp9_webm_rt vpxenc_vp9_webm_rt_multithread_tiled vpxenc_vp9_webm_rt_multithread_tiled_frameparallel - vpxenc_vp9_webm_2pass vpxenc_vp9_ivf_lossless vpxenc_vp9_ivf_minq0_maxq0 vpxenc_vp9_webm_lag10_frames20 vpxenc_vp9_webm_non_square_par" +if [ "$(vpx_config_option_enabled CONFIG_REALTIME_ONLY)" != "yes" ]; then + vpxenc_tests="$vpxenc_tests + vpxenc_vp8_webm_2pass + vpxenc_vp8_webm_lag10_frames20 + vpxenc_vp9_webm_2pass" +fi run_tests vpxenc_verify_environment "${vpxenc_tests}" diff --git a/libvpx/test/webm_video_source.h b/libvpx/test/webm_video_source.h index 53713618e..09c007a3f 100644 --- a/libvpx/test/webm_video_source.h +++ b/libvpx/test/webm_video_source.h @@ -40,8 +40,8 @@ class WebMVideoSource : public CompressedVideoSource { virtual void Begin() { vpx_ctx_->file = OpenTestDataFile(file_name_); - ASSERT_TRUE(vpx_ctx_->file != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(vpx_ctx_->file != NULL) + << "Input file open failed. Filename: " << file_name_; ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM"; diff --git a/libvpx/test/y4m_video_source.h b/libvpx/test/y4m_video_source.h index 2682ddde3..1301f6970 100644 --- a/libvpx/test/y4m_video_source.h +++ b/libvpx/test/y4m_video_source.h @@ -34,8 +34,8 @@ class Y4mVideoSource : public VideoSource { virtual void OpenSource() { CloseSource(); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. 
Filename: " << file_name_; } virtual void ReadSourceToStart() { diff --git a/libvpx/test/yuv_video_source.h b/libvpx/test/yuv_video_source.h index 71ad2ab9a..aee6b2ffb 100644 --- a/libvpx/test/yuv_video_source.h +++ b/libvpx/test/yuv_video_source.h @@ -43,8 +43,8 @@ class YUVVideoSource : public VideoSource { virtual void Begin() { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); - ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " - << file_name_; + ASSERT_TRUE(input_file_ != NULL) + << "Input file open failed. Filename: " << file_name_; if (start_) { fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET); } diff --git a/libvpx/third_party/googletest/README.libvpx b/libvpx/third_party/googletest/README.libvpx index 3d9938096..2cd6910b4 100644 --- a/libvpx/third_party/googletest/README.libvpx +++ b/libvpx/third_party/googletest/README.libvpx @@ -20,3 +20,5 @@ Local Modifications: LICENSE README.md src +- Suppress unsigned overflow instrumentation in the LCG + https://github.com/google/googletest/pull/1066 diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h index 0094ed507..da57e65d3 100644 --- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h +++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h @@ -985,6 +985,19 @@ using ::std::tuple_size; # define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ #endif // __clang__ +// A function level attribute to disable UndefinedBehaviorSanitizer's (defined) +// unsigned integer overflow instrumentation. +#if defined(__clang__) +# if defined(__has_attribute) && __has_attribute(no_sanitize) +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +# else +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +# endif // defined(__has_attribute) && __has_attribute(no_sanitize) +#else +# define GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ +#endif // __clang__ + namespace testing { class Message; diff --git a/libvpx/third_party/googletest/src/src/gtest.cc b/libvpx/third_party/googletest/src/src/gtest.cc index d882ab2e3..5a8932c73 100644 --- a/libvpx/third_party/googletest/src/src/gtest.cc +++ b/libvpx/third_party/googletest/src/src/gtest.cc @@ -308,6 +308,7 @@ namespace internal { // Generates a random number from [0, range), using a Linear // Congruential Generator (LCG). Crashes if 'range' is 0 or greater // than kMaxRange. +GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ UInt32 Random::Generate(UInt32 range) { // These constants are the same as are used in glibc's rand(3). 
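// A sketch of the arithmetic being suppressed, assuming kMaxRange = 1u << 31
// as in stock googletest: the update below computes
// state' = (a * state + c) mod 2^31 with glibc's a = 1103515245 and
// c = 12345, and the product a * state routinely wraps past UINT32_MAX.
// That wraparound is well-defined for unsigned types, but UBSan's optional
// unsigned-integer-overflow check still reports it, hence the
// GTEST_ATTRIBUTE_NO_SANITIZE_UNSIGNED_OVERFLOW_ annotation above.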
state_ = (1103515245U*state_ + 12345U) % kMaxRange; diff --git a/libvpx/third_party/libwebm/README.libvpx b/libvpx/third_party/libwebm/README.libvpx index 1f8a13d78..ebb5ff2f4 100644 --- a/libvpx/third_party/libwebm/README.libvpx +++ b/libvpx/third_party/libwebm/README.libvpx @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 9732ae991efb71aced4267d4794918279e362d99 +Version: 0ae757087f5e6eb01dfea16cc09205b2425cfb74 License: BSD License File: LICENSE.txt diff --git a/libvpx/third_party/libwebm/common/hdr_util.h b/libvpx/third_party/libwebm/common/hdr_util.h index 689fb30a3..3ef5388fd 100644 --- a/libvpx/third_party/libwebm/common/hdr_util.h +++ b/libvpx/third_party/libwebm/common/hdr_util.h @@ -47,7 +47,15 @@ struct Vp9CodecFeatures { int chroma_subsampling; }; +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic pop +#endif bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, PrimaryChromaticityPtr* muxer_pc); diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc index 299b45c98..15b9a908d 100644 --- a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -24,6 +24,11 @@ #include "mkvmuxer/mkvwriter.h" #include "mkvparser/mkvparser.h" +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + namespace mkvmuxer { const float PrimaryChromaticity::kChromaticityMin = 0.0f; @@ -3053,7 +3058,7 @@ Segment::Segment() output_cues_(true), accurate_cluster_duration_(false), fixed_size_cluster_timecode_(false), - estimate_file_duration_(true), + estimate_file_duration_(false), payload_pos_(0), size_position_(0), doc_type_version_(kDefaultDocTypeVersion), @@ -3361,7 +3366,10 @@ uint64_t Segment::AddVideoTrack(int32_t width, int32_t height, int32_t number) { track->set_width(width); track->set_height(height); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } has_video_ = true; return track->number(); @@ -3383,8 +3391,10 @@ bool Segment::AddCuePoint(uint64_t timestamp, uint64_t track) { cue->set_block_number(cluster->blocks_added()); cue->set_cluster_pos(cluster->position_for_cues()); cue->set_track(track); - if (!cues_.AddCue(cue)) + if (!cues_.AddCue(cue)) { + delete cue; return false; + } new_cuepoint_ = false; return true; @@ -3401,7 +3411,10 @@ uint64_t Segment::AddAudioTrack(int32_t sample_rate, int32_t channels, track->set_sample_rate(sample_rate); track->set_channels(channels); - tracks_.AddTrack(track, number); + if (!tracks_.AddTrack(track, number)) { + delete track; + return 0; + } return track->number(); } @@ -3490,16 +3503,33 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (frame->discard_padding() != 0) doc_type_version_ = 4; + if (cluster_list_size_ > 0) { + const uint64_t timecode_scale = segment_info_.timecode_scale(); + const uint64_t frame_timecode = frame->timestamp() / timecode_scale; + + const Cluster* const last_cluster = cluster_list_[cluster_list_size_ - 1]; + const uint64_t last_cluster_timecode = last_cluster->timecode(); + + const uint64_t rel_timecode = frame_timecode - 
last_cluster_timecode; + if (rel_timecode > kMaxBlockTimecode) { + force_new_cluster_ = true; + } + } + // If the segment has a video track hold onto audio frames to make sure the // audio that is associated with the start time of a video key-frame is // muxed into the same cluster. if (has_video_ && tracks_.TrackIsAudio(frame->track_number()) && !force_new_cluster_) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame || !new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; - if (!QueueFrame(new_frame)) + } + if (!QueueFrame(new_frame)) { + delete new_frame; return false; + } track_frames_written_[frame->track_number() - 1]++; return true; } @@ -3522,8 +3552,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { if (!frame->CanBeSimpleBlock() && !frame->is_key() && !frame->reference_block_timestamp_set()) { Frame* const new_frame = new (std::nothrow) Frame(); - if (!new_frame->CopyFrom(*frame)) + if (!new_frame || !new_frame->CopyFrom(*frame)) { + delete new_frame; return false; + } new_frame->set_reference_block_timestamp( last_track_timestamp_[frame->track_number() - 1]); frame = new_frame; diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 1ba17ac1b..355d4e22b 100644 --- a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -10,6 +10,7 @@ #ifdef __ANDROID__ #include <fcntl.h> +#include <unistd.h> #endif #include <cassert> @@ -288,7 +289,7 @@ uint64 EbmlElementSize(uint64 type, const char* value) { ebml_size += strlen(value); // Size of Datasize - ebml_size++; + ebml_size += GetCodedUIntSize(strlen(value)); return ebml_size; } diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc index ec34e4df8..84655d802 100644 --- a/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -8,6 +8,8 @@ #include "mkvmuxer/mkvwriter.h" +#include <sys/types.h> + #ifdef _MSC_VER #include <share.h> // for _SH_DENYWR #endif diff --git a/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/libvpx/third_party/libwebm/mkvparser/mkvparser.cc index e62d6f607..37f230d0a 100644 --- a/libvpx/third_party/libwebm/mkvparser/mkvparser.cc +++ b/libvpx/third_party/libwebm/mkvparser/mkvparser.cc @@ -22,6 +22,11 @@ #include "common/webmids.h" +// disable deprecation warnings for auto_ptr +#if defined(__GNUC__) && __GNUC__ >= 5 +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#endif + namespace mkvparser { const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; @@ -1528,15 +1533,19 @@ long SeekHead::Parse() { if (pos != stop) return E_FILE_FORMAT_INVALID; - m_entries = new (std::nothrow) Entry[entry_count]; + if (entry_count > 0) { + m_entries = new (std::nothrow) Entry[entry_count]; - if (m_entries == NULL) - return -1; + if (m_entries == NULL) + return -1; + } - m_void_elements = new (std::nothrow) VoidElement[void_element_count]; + if (void_element_count > 0) { + m_void_elements = new (std::nothrow) VoidElement[void_element_count]; - if (m_void_elements == NULL) - return -1; + if (m_void_elements == NULL) + return -1; + } // now parse the entries and void elements @@ -1555,14 +1564,14 @@ long SeekHead::Parse() { if (status < 0) // error return status; - if (id == libwebm::kMkvSeek) { + if (id == libwebm::kMkvSeek && entry_count > 0) 
{ if (ParseEntry(pReader, pos, size, pEntry)) { Entry& e = *pEntry++; e.element_start = idpos; e.element_size = (pos + size) - idpos; } - } else if (id == libwebm::kMkvVoid) { + } else if (id == libwebm::kMkvVoid && void_element_count > 0) { VoidElement& e = *pVoidElement++; e.element_start = idpos; @@ -2426,7 +2435,9 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, } const CuePoint::TrackPosition* CuePoint::Find(const Track* pTrack) const { - assert(pTrack); + if (pTrack == NULL) { + return NULL; + } const long long n = pTrack->GetNumber(); @@ -4026,7 +4037,7 @@ long SegmentInfo::Parse() { } const double rollover_check = m_duration * m_timecodeScale; - if (rollover_check > LLONG_MAX) + if (rollover_check > static_cast<double>(LLONG_MAX)) return E_FILE_FORMAT_INVALID; if (pos != stop) @@ -4975,29 +4986,27 @@ bool PrimaryChromaticity::Parse(IMkvReader* reader, long long read_pos, if (!reader) return false; - std::auto_ptr<PrimaryChromaticity> chromaticity_ptr; - - if (!*chromaticity) { - chromaticity_ptr.reset(new PrimaryChromaticity()); - } else { - chromaticity_ptr.reset(*chromaticity); - } + if (!*chromaticity) + *chromaticity = new PrimaryChromaticity(); - if (!chromaticity_ptr.get()) + if (!*chromaticity) return false; - float* value = is_x ? &chromaticity_ptr->x : &chromaticity_ptr->y; + PrimaryChromaticity* pc = *chromaticity; + float* value = is_x ? &pc->x : &pc->y; double parser_value = 0; - const long long value_parse_status = + const long long parse_status = UnserializeFloat(reader, read_pos, value_size, parser_value); - *value = static_cast<float>(parser_value); - - if (value_parse_status < 0 || *value < 0.0 || *value > 1.0) + // Valid range is [0, 1]. Make sure the double is representable as a float + // before casting. + if (parse_status < 0 || parser_value < 0.0 || parser_value > 1.0 || + (parser_value > 0.0 && parser_value < FLT_MIN)) return false; - *chromaticity = chromaticity_ptr.release(); + *value = static_cast<float>(parser_value); + return true; } @@ -5228,7 +5237,9 @@ bool Projection::Parse(IMkvReader* reader, long long start, long long size, double value = 0; const long long value_parse_status = UnserializeFloat(reader, read_pos, child_size, value); - if (value_parse_status < 0) { + // Make sure value is representable as a float before casting. + if (value_parse_status < 0 || value < -FLT_MAX || value > FLT_MAX || + (value > 0.0 && value < FLT_MIN)) { return false; } @@ -7932,7 +7943,6 @@ long Block::Parse(const Cluster* pCluster) { pf = m_frames; while (pf != pf_end) { Frame& f = *pf++; - assert((pos + f.len) <= stop); if ((pos + f.len) > stop) return E_FILE_FORMAT_INVALID; diff --git a/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/libvpx/third_party/libwebm/mkvparser/mkvreader.cc index b8fd00c26..23d68f508 100644 --- a/libvpx/third_party/libwebm/mkvparser/mkvreader.cc +++ b/libvpx/third_party/libwebm/mkvparser/mkvreader.cc @@ -7,6 +7,8 @@ // be found in the AUTHORS file in the root of the source tree. 
#include "mkvparser/mkvreader.h" +#include <sys/types.h> + #include <cassert> namespace mkvparser { diff --git a/libvpx/tools.mk b/libvpx/tools.mk index 23adcee6e..1d005b2ac 100644 --- a/libvpx/tools.mk +++ b/libvpx/tools.mk @@ -13,6 +13,8 @@ TOOLS-yes += tiny_ssim.c tiny_ssim.SRCS += vpx/vpx_integer.h y4minput.c y4minput.h \ vpx/vpx_codec.h vpx/src/vpx_image.c tiny_ssim.SRCS += vpx_mem/vpx_mem.c vpx_mem/vpx_mem.h +tiny_ssim.SRCS += vpx_dsp/ssim.h vpx_scale/yv12config.h +tiny_ssim.SRCS += vpx_ports/mem.h vpx_ports/mem.h tiny_ssim.SRCS += vpx_mem/include/vpx_mem_intrnl.h tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files diff --git a/libvpx/tools/all_builds.py b/libvpx/tools/all_builds.py deleted file mode 100755 index d1f0c80c0..000000000 --- a/libvpx/tools/all_builds.py +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python - -import getopt -import subprocess -import sys - -LONG_OPTIONS = ["shard=", "shards="] -BASE_COMMAND = "./configure --enable-internal-stats --enable-experimental" - -def RunCommand(command): - run = subprocess.Popen(command, shell=True) - output = run.communicate() - if run.returncode: - print "Non-zero return code: " + str(run.returncode) + " => exiting!" - sys.exit(1) - -def list_of_experiments(): - experiments = [] - configure_file = open("configure") - list_start = False - for line in configure_file.read().split("\n"): - if line == 'EXPERIMENT_LIST="': - list_start = True - elif line == '"': - list_start = False - elif list_start: - currently_broken = ["csm"] - experiment = line[4:] - if experiment not in currently_broken: - experiments.append(experiment) - return experiments - -def main(argv): - # Parse arguments - options = {"--shard": 0, "--shards": 1} - if "--" in argv: - opt_end_index = argv.index("--") - else: - opt_end_index = len(argv) - try: - o, _ = getopt.getopt(argv[1:opt_end_index], None, LONG_OPTIONS) - except getopt.GetoptError, err: - print str(err) - print "Usage: %s [--shard=<n> --shards=<n>] -- [configure flag ...]"%argv[0] - sys.exit(2) - - options.update(o) - extra_args = argv[opt_end_index + 1:] - - # Shard experiment list - shard = int(options["--shard"]) - shards = int(options["--shards"]) - experiments = list_of_experiments() - base_command = " ".join([BASE_COMMAND] + extra_args) - configs = [base_command] - configs += ["%s --enable-%s" % (base_command, e) for e in experiments] - my_configs = zip(configs, range(len(configs))) - my_configs = filter(lambda x: x[1] % shards == shard, my_configs) - my_configs = [e[0] for e in my_configs] - - # Run configs for this shard - for config in my_configs: - test_build(config) - -def test_build(configure_command): - print "\033[34m\033[47mTesting %s\033[0m" % (configure_command) - RunCommand(configure_command) - RunCommand("make clean") - RunCommand("make") - -if __name__ == "__main__": - main(sys.argv) diff --git a/libvpx/tools/author_first_release.sh b/libvpx/tools/author_first_release.sh deleted file mode 100755 index 7b0b79721..000000000 --- a/libvpx/tools/author_first_release.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash -## -## List the release each author first contributed to. -## -## Usage: author_first_release.sh [TAGS] -## -## If the TAGS arguments are unspecified, all tags reported by `git tag` -## will be considered. 
-## -tags=${@:-$(git tag)} -for tag in $tags; do - git shortlog -n -e -s $tag | - cut -f2- | - awk "{print \"${tag#v}\t\"\$0}" -done | sort -k2 | uniq -f2 diff --git a/libvpx/tools/ftfy.sh b/libvpx/tools/ftfy.sh deleted file mode 100755 index c005918fe..000000000 --- a/libvpx/tools/ftfy.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/bin/sh -self="$0" -dirname_self=$(dirname "$self") - -usage() { - cat <<EOF >&2 -Usage: $self [option] - -This script applies a whitespace transformation to the commit at HEAD. If no -options are given, then the modified files are left in the working tree. - -Options: - -h, --help Shows this message - -n, --dry-run Shows a diff of the changes to be made. - --amend Squashes the changes into the commit at HEAD - This option will also reformat the commit message. - --commit Creates a new commit containing only the whitespace changes - --msg-only Reformat the commit message only, ignore the patch itself. - -EOF - rm -f ${CLEAN_FILES} - exit 1 -} - - -log() { - echo "${self##*/}: $@" >&2 -} - - -vpx_style() { - for f; do - case "$f" in - *.h|*.c|*.cc) - clang-format -i --style=file "$f" - ;; - esac - done -} - - -apply() { - [ $INTERSECT_RESULT -ne 0 ] && patch -p1 < "$1" -} - - -commit() { - LAST_CHANGEID=$(git show | awk '/Change-Id:/{print $2}') - if [ -z "$LAST_CHANGEID" ]; then - log "HEAD doesn't have a Change-Id, unable to generate a new commit" - exit 1 - fi - - # Build a deterministic Change-Id from the parent's - NEW_CHANGEID=${LAST_CHANGEID}-styled - NEW_CHANGEID=I$(echo $NEW_CHANGEID | git hash-object --stdin) - - # Commit, preserving authorship from the parent commit. - git commit -a -C HEAD > /dev/null - git commit --amend -F- << EOF -Cosmetic: Fix whitespace in change ${LAST_CHANGEID:0:9} - -Change-Id: ${NEW_CHANGEID} -EOF -} - - -show_commit_msg_diff() { - if [ $DIFF_MSG_RESULT -ne 0 ]; then - log "Modified commit message:" - diff -u "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" | tail -n +3 - fi -} - - -amend() { - show_commit_msg_diff - if [ $DIFF_MSG_RESULT -ne 0 ] || [ $INTERSECT_RESULT -ne 0 ]; then - git commit -a --amend -F "$NEW_COMMIT_MSG" - fi -} - - -diff_msg() { - git log -1 --format=%B > "$ORIG_COMMIT_MSG" - "${dirname_self}"/wrap-commit-msg.py \ - < "$ORIG_COMMIT_MSG" > "$NEW_COMMIT_MSG" - cmp -s "$ORIG_COMMIT_MSG" "$NEW_COMMIT_MSG" - DIFF_MSG_RESULT=$? -} - - -# Temporary files -ORIG_DIFF=orig.diff.$$ -MODIFIED_DIFF=modified.diff.$$ -FINAL_DIFF=final.diff.$$ -ORIG_COMMIT_MSG=orig.commit-msg.$$ -NEW_COMMIT_MSG=new.commit-msg.$$ -CLEAN_FILES="${ORIG_DIFF} ${MODIFIED_DIFF} ${FINAL_DIFF}" -CLEAN_FILES="${CLEAN_FILES} ${ORIG_COMMIT_MSG} ${NEW_COMMIT_MSG}" - -# Preconditions -[ $# -lt 2 ] || usage - -if ! clang-format -version >/dev/null 2>&1; then - log "clang-format not found" - exit 1 -fi - -if ! git diff --quiet HEAD; then - log "Working tree is dirty, commit your changes first" - exit 1 -fi - -# Need to be in the root -cd "$(git rev-parse --show-toplevel)" - -# Collect the original diff -git show > "${ORIG_DIFF}" - -# Apply the style guide on new and modified files and collect its diff -for f in $(git diff HEAD^ --name-only -M90 --diff-filter=AM); do - case "$f" in - third_party/*) continue;; - esac - vpx_style "$f" -done -git diff --no-color --no-ext-diff > "${MODIFIED_DIFF}" - -# Intersect the two diffs -"${dirname_self}"/intersect-diffs.py \ - "${ORIG_DIFF}" "${MODIFIED_DIFF}" > "${FINAL_DIFF}" -INTERSECT_RESULT=$? 
-git reset --hard >/dev/null - -# Fixup the commit message -diff_msg - -# Handle options -if [ -n "$1" ]; then - case "$1" in - -h|--help) usage;; - -n|--dry-run) cat "${FINAL_DIFF}"; show_commit_msg_diff;; - --commit) apply "${FINAL_DIFF}"; commit;; - --amend) apply "${FINAL_DIFF}"; amend;; - --msg-only) amend;; - *) usage;; - esac -else - apply "${FINAL_DIFF}" - if ! git diff --quiet; then - log "Formatting changes applied, verify and commit." - log "See also: http://www.webmproject.org/code/contribute/conventions/" - git diff --stat - fi -fi - -rm -f ${CLEAN_FILES} diff --git a/libvpx/tools/tiny_ssim.c b/libvpx/tools/tiny_ssim.c index 1f6a448bc..5e8ca02b4 100644 --- a/libvpx/tools/tiny_ssim.c +++ b/libvpx/tools/tiny_ssim.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include <errno.h> #include <math.h> #include <stdio.h> @@ -16,73 +17,36 @@ #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "./y4minput.h" +#include "vpx_dsp/ssim.h" +#include "vpx_ports/mem.h" + +static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 +static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 +static const int64_t cc1_10 = 428658; // (64^2*(.01*1023)^2 +static const int64_t cc2_10 = 3857925; // (64^2*(.03*1023)^2 +static const int64_t cc1_12 = 6868593; // (64^2*(.01*4095)^2 +static const int64_t cc2_12 = 61817334; // (64^2*(.03*4095)^2 + +#if CONFIG_VP9_HIGHBITDEPTH +static uint64_t calc_plane_error16(uint16_t *orig, int orig_stride, + uint16_t *recon, int recon_stride, + unsigned int cols, unsigned int rows) { + unsigned int row, col; + uint64_t total_sse = 0; + int diff; -void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp, - uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, - uint32_t *sum_sq_r, uint32_t *sum_sxr) { - int i, j; - for (i = 0; i < 8; i++, s += sp, r += rp) { - for (j = 0; j < 8; j++) { - *sum_s += s[j]; - *sum_r += r[j]; - *sum_sq_s += s[j] * s[j]; - *sum_sq_r += r[j] * r[j]; - *sum_sxr += s[j] * r[j]; + for (row = 0; row < rows; row++) { + for (col = 0; col < cols; col++) { + diff = orig[col] - recon[col]; + total_sse += diff * diff; } - } -} - -static const int64_t cc1 = 26634; // (64^2*(.01*255)^2 -static const int64_t cc2 = 239708; // (64^2*(.03*255)^2 -static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, - uint32_t sum_sq_r, uint32_t sum_sxr, int count) { - int64_t ssim_n, ssim_d; - int64_t c1, c2; - - // scale the constants by number of pixels - c1 = (cc1 * count * count) >> 12; - c2 = (cc2 * count * count) >> 12; - - ssim_n = (2 * sum_s * sum_r + c1) * - ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); - - ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * - ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + - (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); - - return ssim_n * 1.0 / ssim_d; -} - -static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) { - uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; - vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, - &sum_sxr); - return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64); -} - -// We are using a 8x8 moving window with starting location of each 8x8 window -// on the 4x4 pixel grid. Such arrangement allows the windows to overlap -// block boundaries to penalize blocking artifacts. 
-double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1, - int stride_img2, int width, int height) { - int i, j; - int samples = 0; - double ssim_total = 0; - - // sample point start with each 4x4 location - for (i = 0; i <= height - 8; - i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { - for (j = 0; j <= width - 8; j += 4) { - double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); - ssim_total += v; - samples++; - } + orig += orig_stride; + recon += recon_stride; } - ssim_total /= samples; - return ssim_total; + return total_sse; } - +#endif static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, int recon_stride, unsigned int cols, unsigned int rows) { @@ -103,7 +67,7 @@ static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon, } #define MAX_PSNR 100 -double vp9_mse2psnr(double samples, double peak, double mse) { +static double mse2psnr(double samples, double peak, double mse) { double psnr; if (mse > 0.0) @@ -126,10 +90,12 @@ typedef struct input_file { vpx_image_t img; int w; int h; + int bit_depth; } input_file_t; // Open a file and determine if its y4m or raw. If y4m get the header. -int open_input_file(const char *file_name, input_file_t *input, int w, int h) { +static int open_input_file(const char *file_name, input_file_t *input, int w, + int h, int bit_depth) { char y4m_buf[4]; size_t r1; input->type = RAW_YUV; @@ -144,6 +110,7 @@ int open_input_file(const char *file_name, input_file_t *input, int w, int h) { y4m_input_open(&input->y4m, input->file, y4m_buf, 4, 0); input->w = input->y4m.pic_w; input->h = input->y4m.pic_h; + input->bit_depth = input->y4m.bit_depth; // Y4M alloc's its own buf. Init this to avoid problems if we never // read frames. memset(&input->img, 0, sizeof(input->img)); @@ -152,14 +119,17 @@ int open_input_file(const char *file_name, input_file_t *input, int w, int h) { fseek(input->file, 0, SEEK_SET); input->w = w; input->h = h; - input->buf = malloc(w * h * 3 / 2); + if (bit_depth < 9) + input->buf = malloc(w * h * 3 / 2); + else + input->buf = malloc(w * h * 3); break; } } return 0; } -void close_input_file(input_file_t *in) { +static void close_input_file(input_file_t *in) { if (in->file) fclose(in->file); if (in->type == Y4M) { vpx_img_free(&in->img); @@ -168,8 +138,8 @@ void close_input_file(input_file_t *in) { } } -size_t read_input_file(input_file_t *in, unsigned char **y, unsigned char **u, - unsigned char **v) { +static size_t read_input_file(input_file_t *in, unsigned char **y, + unsigned char **u, unsigned char **v, int bd) { size_t r1 = 0; switch (in->type) { case Y4M: @@ -179,18 +149,429 @@ size_t read_input_file(input_file_t *in, unsigned char **y, unsigned char **u, *v = in->img.planes[2]; break; case RAW_YUV: - r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); - *y = in->buf; - *u = in->buf + in->w * in->h; - *v = in->buf + 5 * in->w * in->h / 4; + if (bd < 9) { + r1 = fread(in->buf, in->w * in->h * 3 / 2, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h; + *v = in->buf + 5 * in->w * in->h / 4; + } else { + r1 = fread(in->buf, in->w * in->h * 3, 1, in->file); + *y = in->buf; + *u = in->buf + in->w * in->h / 2; + *v = *u + in->w * in->h / 2; + } break; } return r1; } +void ssim_parms_16x16(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 16; i++, s += sp, r += rp) { + for (j = 0; j < 16; j++) { + *sum_s += 
s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} +void ssim_parms_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +void highbd_ssim_parms_8x8(const uint16_t *s, int sp, const uint16_t *r, int rp, + uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, + uint32_t *sum_sq_r, uint32_t *sum_sxr) { + int i, j; + for (i = 0; i < 8; i++, s += sp, r += rp) { + for (j = 0; j < 8; j++) { + *sum_s += s[j]; + *sum_r += r[j]; + *sum_sq_s += s[j] * s[j]; + *sum_sq_r += r[j] * r[j]; + *sum_sxr += s[j] * r[j]; + } + } +} + +static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s, + uint32_t sum_sq_r, uint32_t sum_sxr, int count, + uint32_t bd) { + int64_t ssim_n, ssim_d; + int64_t c1 = 0, c2 = 0; + if (bd == 8) { + // scale the constants by number of pixels + c1 = (cc1 * count * count) >> 12; + c2 = (cc2 * count * count) >> 12; + } else if (bd == 10) { + c1 = (cc1_10 * count * count) >> 12; + c2 = (cc2_10 * count * count) >> 12; + } else if (bd == 12) { + c1 = (cc1_12 * count * count) >> 12; + c2 = (cc2_12 * count * count) >> 12; + } else { + assert(0); + } + + ssim_n = (2 * sum_s * sum_r + c1) * + ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2); + + ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) * + ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s + + (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2); + + return ssim_n * 1.0 / ssim_d; +} + +static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8); +} + +static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r, + int rp, uint32_t bd, uint32_t shift) { + uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0; + highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, + &sum_sxr); + return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift), + sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd); +} + +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. 
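+// Illustration with a hypothetical frame size: on a 16x16 plane the loops
+// below place windows at offsets 0, 4 and 8 in each direction, i.e.
+// 3 * 3 = 9 overlapping 8x8 samples, so the 4x4 sampling grid guarantees
+// that block boundaries always land in the interior of some window.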
+static double ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1, + int stride_img2, int width, int height) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +static double highbd_ssim2(const uint8_t *img1, const uint8_t *img2, + int stride_img1, int stride_img2, int width, + int height, uint32_t bd, uint32_t shift) { + int i, j; + int samples = 0; + double ssim_total = 0; + + // sample point start with each 4x4 location + for (i = 0; i <= height - 8; + i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) { + for (j = 0; j <= width - 8; j += 4) { + double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1, + CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd, + shift); + ssim_total += v; + samples++; + } + } + ssim_total /= samples; + return ssim_total; +} + +// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity +// +// Re working out the math -> +// +// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) / +// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2)) +// +// mean(x) = sum(x) / n +// +// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n) +// +// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n) +// +// ssim(x,y) = +// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) * +// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+ +// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2))) +// +// factoring out n*n +// +// ssim(x,y) = +// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) / +// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) * +// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2)) +// +// Replace c1 with n*n * c1 for the final step that leads to this code: +// The final step scales by 12 bits so we don't lose precision in the constants. + +static double ssimv_similarity(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. + const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) / + (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1); + + // Since these variables are unsigned sums, convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} + +// The first term of the ssim metric is a luminance factor. +// +// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1) +// +// This luminance factor is super sensitive to the dark side of luminance +// values and completely insensitive on the white side. check out 2 sets +// (1,3) and (250,252) the term gives ( 2*1*3/(1+9) = .60 +// 2*250*252/ (250^2+252^2) => .99999997 +// +// As a result in this tweaked version of the calculation in which the +// luminance is taken as percentage off from peak possible. +// +// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count +// +static double ssimv_similarity2(const Ssimv *sv, int64_t n) { + // Scale the constants by number of pixels. 
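+  // (Extending the (1, 3) vs (250, 252) example above: both pairs have a
+  // mean difference of 2, so this tweaked term evaluates to roughly
+  // (255 * 255 - 4) / (255 * 255) ~= 0.9999 in both cases, removing the
+  // classic term's spread between 0.60 and 0.99999997.)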
+ const int64_t c1 = (cc1 * n * n) >> 12; + const int64_t c2 = (cc2 * n * n) >> 12; + + const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n; + const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1); + + // Since these variables are unsigned, sums convert to double so + // math is done in double arithmetic. + const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) / + (n * sv->sum_sq_s - sv->sum_s * sv->sum_s + + n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2); + + return l * v; +} +static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, Ssimv *sv) { + ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r, + &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr); +} + +double get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2, + int img2_pitch, int width, int height, Ssimv *sv2, + Metrics *m, int do_inconsistency) { + double dssim_total = 0; + double ssim_total = 0; + double ssim2_total = 0; + double inconsistency_total = 0; + int i, j; + int c = 0; + double norm; + double old_ssim_total = 0; + + // We can sample points as frequently as we like start with 1 per 4x4. + for (i = 0; i < height; + i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) { + for (j = 0; j < width; j += 4, ++c) { + Ssimv sv = { 0, 0, 0, 0, 0, 0 }; + double ssim; + double ssim2; + double dssim; + uint32_t var_new; + uint32_t var_old; + uint32_t mean_new; + uint32_t mean_old; + double ssim_new; + double ssim_old; + + // Not sure there's a great way to handle the edge pixels + // in ssim when using a window. Seems biased against edge pixels + // however you handle this. This uses only samples that are + // fully in the frame. + if (j + 8 <= width && i + 8 <= height) { + ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv); + } + + ssim = ssimv_similarity(&sv, 64); + ssim2 = ssimv_similarity2(&sv, 64); + + sv.ssim = ssim2; + + // dssim is calculated to use as an actual error metric and + // is scaled up to the same range as sum square error. + // Since we are subsampling every 16th point maybe this should be + // *16 ? + dssim = 255 * 255 * (1 - ssim2) / 2; + + // Here I introduce a new error metric: consistency-weighted + // SSIM-inconsistency. This metric isolates frames where the + // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much + // sharper or blurrier than the others. Higher values indicate a + // temporally inconsistent SSIM. There are two ideas at work: + // + // 1) 'SSIM-inconsistency': the total inconsistency value + // reflects how much SSIM values are changing between this + // source / reference frame pair and the previous pair. + // + // 2) 'consistency-weighted': weights de-emphasize areas in the + // frame where the scene content has changed. Changes in scene + // content are detected via changes in local variance and local + // mean. + // + // Thus the overall measure reflects how inconsistent the SSIM + // values are, over consistent regions of the frame. + // + // The metric has three terms: + // + // term 1 -> uses change in scene Variance to weight error score + // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. + // + // term 2 -> uses change in local scene luminance to weight error + // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2) + // larger changes from one frame to the next mean we care + // less about consistency. 
+ // + // term3 -> measures inconsistency in ssim scores between frames + // 1 - ( 2 * ssim(Fi)*ssim(Fi-1)/(ssim(Fi)^2+sssim(Fi-1)^2). + // + // This term compares the ssim score for the same location in 2 + // subsequent frames. + var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64; + var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64; + mean_new = sv.sum_s; + mean_old = sv2[c].sum_s; + ssim_new = sv.ssim; + ssim_old = sv2[c].ssim; + + if (do_inconsistency) { + // We do the metric once for every 4x4 block in the image. Since + // we are scaling the error to SSE for use in a psnr calculation + // 1.0 = 4x4x255x255 the worst error we can possibly have. + static const double kScaling = 4. * 4 * 255 * 255; + + // The constants have to be non 0 to avoid potential divide by 0 + // issues other than that they affect kind of a weighting between + // the terms. No testing of what the right terms should be has been + // done. + static const double c1 = 1, c2 = 1, c3 = 1; + + // This measures how much consistent variance is in two consecutive + // source frames. 1.0 means they have exactly the same variance. + const double variance_term = + (2.0 * var_old * var_new + c1) / + (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1); + + // This measures how consistent the local mean are between two + // consecutive frames. 1.0 means they have exactly the same mean. + const double mean_term = + (2.0 * mean_old * mean_new + c2) / + (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2); + + // This measures how consistent the ssims of two + // consecutive frames is. 1.0 means they are exactly the same. + double ssim_term = + pow((2.0 * ssim_old * ssim_new + c3) / + (ssim_old * ssim_old + ssim_new * ssim_new + c3), + 5); + + double this_inconsistency; + + // Floating point math sometimes makes this > 1 by a tiny bit. + // We want the metric to scale between 0 and 1.0 so we can convert + // it to an snr scaled value. + if (ssim_term > 1) ssim_term = 1; + + // This converts the consistency metric to an inconsistency metric + // ( so we can scale it like psnr to something like sum square error. + // The reason for the variance and mean terms is the assumption that + // if there are big changes in the source we shouldn't penalize + // inconsistency in ssim scores a bit less as it will be less visible + // to the user. + this_inconsistency = (1 - ssim_term) * variance_term * mean_term; + + this_inconsistency *= kScaling; + inconsistency_total += this_inconsistency; + } + sv2[c] = sv; + ssim_total += ssim; + ssim2_total += ssim2; + dssim_total += dssim; + + old_ssim_total += ssim_old; + } + old_ssim_total += 0; + } + + norm = 1. 
/ (width / 4) / (height / 4); + ssim_total *= norm; + ssim2_total *= norm; + m->ssim2 = ssim2_total; + m->ssim = ssim_total; + if (old_ssim_total == 0) inconsistency_total = 0; + + m->ssimc = inconsistency_total; + + m->dssim = dssim_total; + return inconsistency_total; +} + +double highbd_calc_ssim(const YV12_BUFFER_CONFIG *source, + const YV12_BUFFER_CONFIG *dest, double *weight, + uint32_t bd, uint32_t in_bd) { + double a, b, c; + double ssimv; + uint32_t shift = 0; + + assert(bd >= in_bd); + shift = bd - in_bd; + + a = highbd_ssim2(source->y_buffer, dest->y_buffer, source->y_stride, + dest->y_stride, source->y_crop_width, source->y_crop_height, + in_bd, shift); + + b = highbd_ssim2(source->u_buffer, dest->u_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + c = highbd_ssim2(source->v_buffer, dest->v_buffer, source->uv_stride, + dest->uv_stride, source->uv_crop_width, + source->uv_crop_height, in_bd, shift); + + ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} + int main(int argc, char *argv[]) { FILE *framestats = NULL; + int bit_depth = 8; int w = 0, h = 0, tl_skip = 0, tl_skips_remaining = 0; double ssimavg = 0, ssimyavg = 0, ssimuavg = 0, ssimvavg = 0; double psnrglb = 0, psnryglb = 0, psnruglb = 0, psnrvglb = 0; @@ -200,11 +581,12 @@ int main(int argc, char *argv[]) { size_t i, n_frames = 0, allocated_frames = 0; int return_value = 0; input_file_t in[2]; + double peak = 255.0; if (argc < 2) { fprintf(stderr, "Usage: %s file1.{yuv|y4m} file2.{yuv|y4m}" - "[WxH tl_skip={0,1,3}]\n", + "[WxH tl_skip={0,1,3} frame_stats_file bits]\n", argv[0]); return_value = 1; goto clean_up; @@ -214,7 +596,11 @@ int main(int argc, char *argv[]) { sscanf(argv[3], "%dx%d", &w, &h); } - if (open_input_file(argv[1], &in[0], w, h) < 0) { + if (argc > 6) { + sscanf(argv[6], "%d", &bit_depth); + } + + if (open_input_file(argv[1], &in[0], w, h, bit_depth) < 0) { fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); goto clean_up; } @@ -223,9 +609,13 @@ int main(int argc, char *argv[]) { // If a y4m is the first file and w, h is not set grab from first file. w = in[0].w; h = in[0].h; + bit_depth = in[0].bit_depth; } + if (bit_depth == 10) peak = 1023.0; - if (open_input_file(argv[2], &in[1], w, h) < 0) { + if (bit_depth == 12) peak = 4095; + + if (open_input_file(argv[2], &in[1], w, h, bit_depth) < 0) { fprintf(stderr, "File %s can't be opened or parsed!\n", argv[2]); goto clean_up; } @@ -264,7 +654,7 @@ int main(int argc, char *argv[]) { size_t r1, r2; unsigned char *y[2], *u[2], *v[2]; - r1 = read_input_file(&in[0], &y[0], &u[0], &v[0]); + r1 = read_input_file(&in[0], &y[0], &u[0], &v[0], bit_depth); if (r1) { // Reading parts of file1.yuv that were not used in temporal layer. 
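The peak values selected above follow peak = 2^bit_depth - 1 (255.0, 1023.0, 4095), and the hunks below thread that peak through every PSNR conversion. Assuming mse2psnr keeps the convention of the vp9_mse2psnr it replaces, with the third argument being an accumulated sum of squared errors rather than a per-pixel MSE, the conversion amounts to PSNR = min(MAX_PSNR, 10 * log10(samples * peak^2 / sse)).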
@@ -276,7 +666,7 @@ int main(int argc, char *argv[]) { tl_skips_remaining = tl_skip; } - r2 = read_input_file(&in[1], &y[1], &u[1], &v[1]); + r2 = read_input_file(&in[1], &y[1], &u[1], &v[1], bit_depth); if (r1 && r2 && r1 != r2) { fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno), @@ -286,9 +676,22 @@ int main(int argc, char *argv[]) { } else if (r1 == 0 || r2 == 0) { break; } +#if CONFIG_VP9_HIGHBITDEPTH +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + if (bit_depth < 9) { \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); \ + } else { \ + ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \ + w, w, h, bit_depth, bit_depth - 8); \ + psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w, \ + CAST_TO_SHORTPTR(buf1), w, w, h); \ + } +#else #define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ - ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \ + ssim = ssim2(buf0, buf1, w, w, w, h); \ psnr = calc_plane_error(buf0, w, buf1, w, w, h); +#endif if (n_frames == allocated_frames) { allocated_frames = allocated_frames == 0 ? 1024 : allocated_frames * 2; @@ -321,11 +724,11 @@ int main(int argc, char *argv[]) { ssimuavg += ssimu[i]; ssimvavg += ssimv[i]; - frame_psnr = vp9_mse2psnr(w * h * 6 / 4, 255.0, - (double)psnry[i] + psnru[i] + psnrv[i]); - frame_psnry = vp9_mse2psnr(w * h * 4 / 4, 255.0, (double)psnry[i]); - frame_psnru = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnru[i]); - frame_psnrv = vp9_mse2psnr(w * h * 1 / 4, 255.0, (double)psnrv[i]); + frame_psnr = + mse2psnr(w * h * 6 / 4, peak, (double)psnry[i] + psnru[i] + psnrv[i]); + frame_psnry = mse2psnr(w * h * 4 / 4, peak, (double)psnry[i]); + frame_psnru = mse2psnr(w * h * 1 / 4, peak, (double)psnru[i]); + frame_psnrv = mse2psnr(w * h * 1 / 4, peak, (double)psnrv[i]); psnravg += frame_psnr; psnryavg += frame_psnry; @@ -367,10 +770,10 @@ int main(int argc, char *argv[]) { puts(""); psnrglb = psnryglb + psnruglb + psnrvglb; - psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb); - psnryglb = vp9_mse2psnr((double)n_frames * w * h * 4 / 4, 255.0, psnryglb); - psnruglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnruglb); - psnrvglb = vp9_mse2psnr((double)n_frames * w * h * 1 / 4, 255.0, psnrvglb); + psnrglb = mse2psnr((double)n_frames * w * h * 6 / 4, peak, psnrglb); + psnryglb = mse2psnr((double)n_frames * w * h * 4 / 4, peak, psnryglb); + psnruglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnruglb); + psnrvglb = mse2psnr((double)n_frames * w * h * 1 / 4, peak, psnrvglb); printf("GlbPSNR: %lf\n", psnrglb); printf("GlbPSNR-Y: %lf\n", psnryglb); diff --git a/libvpx/vp8/common/blockd.h b/libvpx/vp8/common/blockd.h index 74fc5d6db..1a3aad16a 100644 --- a/libvpx/vp8/common/blockd.h +++ b/libvpx/vp8/common/blockd.h @@ -169,6 +169,11 @@ typedef struct { typedef struct { FRAME_TYPE frame_type; int is_frame_dropped; + // If frame is dropped due to overshoot after encode_frame. This triggers a + // drop and resets rate control with Q forced to max for following frame. + // The check for this dropping due to overshoot is only done on lowest stream, + // and if set will force drop on all spatial streams for that current frame. + int is_frame_dropped_overshoot_maxqp; // The frame rate for the lowest resolution. 
double low_res_framerate; /* The frame number of each reference frames */ diff --git a/libvpx/vp8/common/loopfilter_filters.c b/libvpx/vp8/common/loopfilter_filters.c index 2a7cde878..188e290ca 100644 --- a/libvpx/vp8/common/loopfilter_filters.c +++ b/libvpx/vp8/common/loopfilter_filters.c @@ -86,10 +86,12 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, u = vp8_signed_char_clamp(ps1 + filter_value); *op1 = u ^ 0x80; } -void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { + +static void loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -109,10 +111,11 @@ void vp8_loop_filter_horizontal_edge_c(unsigned char *s, int p, /* pitch */ } while (++i < count * 8); } -void vp8_loop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void loop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { int hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -185,11 +188,11 @@ static void vp8_mbfilter(signed char mask, uc hev, uc *op2, uc *op1, uc *op0, *op2 = s ^ 0x80; } -void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, - int count) { +static void mbloop_filter_horizontal_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -210,10 +213,11 @@ void vp8_mbloop_filter_horizontal_edge_c(unsigned char *s, int p, } while (++i < count * 8); } -void vp8_mbloop_filter_vertical_edge_c(unsigned char *s, int p, - const unsigned char *blimit, - const unsigned char *limit, - const unsigned char *thresh, int count) { +static void mbloop_filter_vertical_edge_c(unsigned char *s, int p, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, + int count) { signed char hev = 0; /* high edge variance */ signed char mask = 0; int i = 0; @@ -295,17 +299,17 @@ void vp8_loop_filter_simple_vertical_edge_c(unsigned char *s, int p, void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_horizontal_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -313,17 +317,17 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, 
int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); + mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); + mbloop_filter_vertical_edge_c(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); } } @@ -331,21 +335,21 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(u_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); + loop_filter_horizontal_edge_c(v_ptr + 4 * uv_stride, uv_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 1); } } @@ -363,21 +367,21 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, int y_stride, void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); if (u_ptr) { - vp8_loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } if (v_ptr) { - vp8_loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 1); + loop_filter_vertical_edge_c(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); } } diff --git a/libvpx/vp8/common/mfqe.c b/libvpx/vp8/common/mfqe.c index 5aace8c99..b6f8146b8 100644 --- a/libvpx/vp8/common/mfqe.c +++ b/libvpx/vp8/common/mfqe.c @@ -74,8 +74,7 @@ static void apply_ifactor(unsigned char *y_src, int y_src_stride, src_weight); vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight); - } 
else /* if (block_size == 8) */ - { + } else { vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight); vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, @@ -136,8 +135,7 @@ static void multiframe_quality_enhance_block( usad = (vpx_sad8x8(u, uv_stride, ud, uvd_stride) + 32) >> 6; vsad = (vpx_sad8x8(v, uv_stride, vd, uvd_stride) + 32) >> 6; #endif - } else /* if (blksize == 8) */ - { + } else { actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse) + 32) >> 6; #ifdef USE_SSD @@ -186,14 +184,12 @@ static void multiframe_quality_enhance_block( apply_ifactor(y, y_stride, yd, yd_stride, u, v, uv_stride, ud, vd, uvd_stride, blksize, ifactor); } - } else /* else implicitly copy from previous frame */ - { + } else { /* else implicitly copy from previous frame */ if (blksize == 16) { vp8_copy_mem16x16(y, y_stride, yd, yd_stride); vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride); vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride); - } else /* if (blksize == 8) */ - { + } else { vp8_copy_mem8x8(y, y_stride, yd, yd_stride); for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride) { @@ -297,8 +293,7 @@ void vp8_multiframe_quality_enhance(VP8_COMMON *cm) { } } } - } else /* totmap = 4 */ - { + } else { /* totmap = 4 */ multiframe_quality_enhance_block( 16, qcurr, qprev, y_ptr, u_ptr, v_ptr, show->y_stride, show->uv_stride, yd_ptr, ud_ptr, vd_ptr, dest->y_stride, diff --git a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c index 2de343419..e46827b0e 100644 --- a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c +++ b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -673,9 +673,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [p2] "r"(p2), + [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3), [src_ptr] "r"(src_ptr)); @@ -724,9 +724,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), - [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4) + [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4) : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2), - [p2] "r"(p2), [n2] "r"(n2), [p4] "r"(p4), [n4] "r"(n4), [p1] "r"(p1), + [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1), [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b)); @@ -781,9 +781,9 @@ void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr, : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), - [Temp4] "=r"(Temp4) - : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1), - [p4] "r"(p4), [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), + [Temp4] "=r"(Temp4), [tp1] "+r"(tp1) + : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4), + [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] 
"r"(n3), [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr)); diff --git a/libvpx/vp8/common/mips/mmi/copymem_mmi.c b/libvpx/vp8/common/mips/mmi/copymem_mmi.c new file mode 100644 index 000000000..86a32aa9e --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/copymem_mmi.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define COPY_MEM_16X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + "ldl %[tmp0], 0x0f(%[src]) \n\t" \ + "ldr %[tmp0], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + "sdl %[tmp0], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp0], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "ldl %[tmp1], 0x0f(%[src]) \n\t" \ + "ldr %[tmp1], 0x08(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "gssdlc1 %[ftmp1], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp1], 0x00(%[dst]) \n\t" \ + "sdl %[tmp1], 0x0f(%[dst]) \n\t" \ + "sdr %[tmp1], 0x08(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +#define COPY_MEM_8X2 \ + "gsldlc1 %[ftmp0], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + "ldl %[tmp0], 0x07(%[src]) \n\t" \ + "ldr %[tmp0], 0x00(%[src]) \n\t" \ + MMI_ADDU(%[src], %[src], %[src_stride]) \ + \ + "gssdlc1 %[ftmp0], 0x07(%[dst]) \n\t" \ + "gssdrc1 %[ftmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) \ + "sdl %[tmp0], 0x07(%[dst]) \n\t" \ + "sdr %[tmp0], 0x00(%[dst]) \n\t" \ + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + +void vp8_copy_mem16x16_mmi(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + double ftmp[2]; + uint64_t tmp[2]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_16X2 + COPY_MEM_16X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x8_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + uint8_t loop_count = 4; + + /* clang-format off */ + __asm__ volatile ( + "1: \n\t" + COPY_MEM_8X2 + MMI_ADDIU(%[loop_count], %[loop_count], -0x01) + "bnez %[loop_count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), [loop_count]"+&r"(loop_count), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} + +void vp8_copy_mem8x4_mmi(unsigned char *src, int src_stride, unsigned char *dst, + int dst_stride) { + double ftmp[2]; + uint64_t tmp[1]; + + /* clang-format off */ + __asm__ volatile ( + COPY_MEM_8X2 + 
COPY_MEM_8X2 + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [tmp0]"=&r"(tmp[0]), + [dst]"+&r"(dst), [src]"+&r"(src) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + /* clang-format on */ +} diff --git a/libvpx/vp8/common/mips/mmi/dequantize_mmi.c b/libvpx/vp8/common/mips/mmi/dequantize_mmi.c new file mode 100644 index 000000000..b3f8084ae --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/dequantize_mmi.c @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vp8/common/blockd.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vp8_dequantize_b_mmi(BLOCKD *d, int16_t *DQC) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[qcoeff]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[qcoeff]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[qcoeff]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[DQC]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[DQC]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[DQC]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[DQC]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[DQC]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[DQC]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[DQC]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[DQC]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[dqcoeff]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[dqcoeff]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[dqcoeff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dqcoeff] "r"(d->dqcoeff), [qcoeff] "r"(d->qcoeff), [DQC] "r"(DQC) + : "memory"); +} + +void vp8_dequant_idct_add_mmi(int16_t *input, int16_t *dq, unsigned char *dest, + int stride) { + double ftmp[8]; + + __asm__ volatile( + "gsldlc1 %[ftmp0], 0x07(%[dq]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[dq]) \n\t" + "gsldlc1 %[ftmp1], 0x0f(%[dq]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[dq]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[dq]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[dq]) \n\t" + "gsldlc1 %[ftmp3], 0x1f(%[dq]) \n\t" + "gsldrc1 %[ftmp3], 0x18(%[dq]) \n\t" + + "gsldlc1 %[ftmp4], 0x07(%[input]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[input]) \n\t" + "gsldlc1 %[ftmp5], 0x0f(%[input]) \n\t" + "gsldrc1 %[ftmp5], 0x08(%[input]) \n\t" + "gsldlc1 %[ftmp6], 0x17(%[input]) \n\t" + "gsldrc1 %[ftmp6], 0x10(%[input]) \n\t" + "gsldlc1 %[ftmp7], 0x1f(%[input]) \n\t" + "gsldrc1 %[ftmp7], 0x18(%[input]) \n\t" + + "pmullh %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "pmullh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "pmullh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "pmullh %[ftmp3], 
%[ftmp3], %[ftmp7] \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "gssdlc1 %[ftmp1], 0x0f(%[input]) \n\t" + "gssdrc1 %[ftmp1], 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[input]) \n\t" + "gssdlc1 %[ftmp3], 0x1f(%[input]) \n\t" + "gssdrc1 %[ftmp3], 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]) + : [dq] "r"(dq), [input] "r"(input) + : "memory"); + + vp8_short_idct4x4llm_mmi(input, dest, stride, dest, stride); + + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[input]) \n\t" + "sdl $0, 0x0f(%[input]) \n\t" + "sdr $0, 0x08(%[input]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[input]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[input]) \n\t" + "sdl $0, 0x1f(%[input]) \n\t" + "sdr $0, 0x18(%[input]) \n\t" + : [ftmp0] "=&f"(ftmp[0]) + : [input] "r"(input) + : "memory"); +} diff --git a/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c b/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c new file mode 100644 index 000000000..f6020ab46 --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/idct_blk_mmi.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_mem/vpx_mem.h" + +void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst, + int stride, int8_t *eobs) { + int i, j; + + for (i = 0; i < 4; i++) { + for (j = 0; j < 4; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dst, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dst += 4; + } + + dst += 4 * stride - 16; + } +} + +void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu, + uint8_t *dstv, int stride, + int8_t *eobs) { + int i, j; + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstu, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstu += 4; + } + + dstu += 4 * stride - 8; + } + + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + if (*eobs++ > 1) { + vp8_dequant_idct_add_mmi(q, dq, dstv, stride); + } else { + vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride); + memset(q, 0, 2 * sizeof(q[0])); + } + + q += 16; + dstv += 4; + } + + dstv += 4 * stride - 8; + } +} diff --git a/libvpx/vp8/common/mips/mmi/idctllm_mmi.c b/libvpx/vp8/common/mips/mmi/idctllm_mmi.c new file mode 100644 index 000000000..5e48f5916 --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/idctllm_mmi.c @@ -0,0 +1,328 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +#define TRANSPOSE_4H \ + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + +void vp8_short_idct4x4llm_mmi(int16_t *input, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + double ftmp[12]; + uint32_t tmp[0]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_04) = { 0x0004000400040004ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_4e7b) = { 0x4e7b4e7b4e7b4e7bULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_22a3) = { 0x22a322a322a322a3ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x02) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + + // ip[0...3] + ip[8...11] + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // ip[0...3] - ip[8...11] + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // (ip[12...15] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp7], %[ftmp9], %[ff_ph_22a3] \n\t" + // (ip[ 4... 7] * sinpi8sqrt2) >> 16 + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp8], %[ftmp9], %[ff_ph_22a3] \n\t" + // ip[ 4... 7] + ((ip[ 4... 
7] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp9], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp9], %[ftmp9], %[ftmp2] \n\t" + // ip[12...15] + ((ip[12...15] * cospi8sqrt2minus1) >> 16) + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp10] \n\t" + "psubh %[ftmp4], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // b + "psubh %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + // c + "psllh %[ftmp9], %[ftmp2], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "psubh %[ftmp7], %[ftmp9], %[ftmp4] \n\t" + "pmulhh %[ftmp10], %[ftmp4], %[ff_ph_4e7b] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp10] \n\t" + // d + "psllh %[ftmp9], %[ftmp4], %[ftmp11] \n\t" + "pmulhh %[ftmp9], %[ftmp9], %[ff_ph_22a3] \n\t" + "paddh %[ftmp8], %[ftmp9], %[ftmp2] \n\t" + "pmulhh %[ftmp10], %[ftmp2], %[ff_ph_4e7b] \n\t" + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_04] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_04] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp6], %[ftmp7] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_04] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_04] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp6], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp2], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp2], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packushb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "gsswlc1 %[ftmp3], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp3], 0x00(%[dst_ptr]) \n\t" + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 
0x00(%[pred_prt]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp8], 0x03(%[pred_ptr]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred_ptr]) \n\t" +#endif + "punpcklbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "gsswlc1 %[ftmp4], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp4], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]), + [pred_ptr]"+&r"(pred_ptr), [dst_ptr]"+&r"(dst_ptr) + : [ip]"r"(input), [ff_ph_22a3]"f"(ff_ph_22a3), + [ff_ph_4e7b]"f"(ff_ph_4e7b), [ff_ph_04]"f"(ff_ph_04), + [pred_stride]"r"((mips_reg)pred_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + +void vp8_dc_only_idct_add_mmi(int16_t input_dc, unsigned char *pred_ptr, + int pred_stride, unsigned char *dst_ptr, + int dst_stride) { + int a1 = ((input_dc + 4) >> 3); + double ftmp[5]; + int low32; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "pshufh %[a1], %[a1], %[ftmp0] \n\t" + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + + MMI_ADDU(%[pred_ptr], %[pred_ptr], %[pred_stride]) + MMI_ADDU(%[dst_ptr], %[dst_ptr], %[dst_stride]) + "ulw %[low32], 0x00(%[pred_ptr]) \n\t" + "mtc1 %[low32], %[ftmp1] \n\t" + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" + "paddsh %[ftmp2], %[ftmp2], %[a1] \n\t" + "packushb %[ftmp1], %[ftmp2], %[ftmp0] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[dst_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[dst_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [low32]"=&r"(low32), + [dst_ptr]"+&r"(dst_ptr), [pred_ptr]"+&r"(pred_ptr) + : [dst_stride]"r"((mips_reg)dst_stride), + [pred_stride]"r"((mips_reg)pred_stride), [a1]"f"(a1) + : "memory" + ); +} + +void vp8_short_inv_walsh4x4_mmi(int16_t *input, int16_t *mb_dqcoeff) { + int i; + int16_t output[16]; + double ftmp[12]; + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_03) = { 0x0003000300030003ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x03) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[ip]) \n\t" 
+ "gsldrc1 %[ftmp2], 0x08(%[ip]) \n\t" + "gsldlc1 %[ftmp3], 0x17(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x10(%[ip]) \n\t" + "gsldlc1 %[ftmp4], 0x1f(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x18(%[ip]) \n\t" + "paddh %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "psubh %[ftmp6], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp7], %[ftmp3], %[ftmp4] \n\t" + "psubh %[ftmp8], %[ftmp3], %[ftmp4] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp3], %[ftmp6], %[ftmp8] \n\t" + "paddh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + TRANSPOSE_4H + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // d + "psubh %[ftmp6], %[ftmp1], %[ftmp4] \n\t" + // b + "paddh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c + "psubh %[ftmp8], %[ftmp2], %[ftmp3] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp7] \n\t" + "paddh %[ftmp2], %[ftmp6], %[ftmp8] \n\t" + "psubh %[ftmp3], %[ftmp5], %[ftmp7] \n\t" + "psubh %[ftmp4], %[ftmp6], %[ftmp8] \n\t" + + "paddh %[ftmp1], %[ftmp1], %[ff_ph_03] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ff_ph_03] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ff_ph_03] \n\t" + "psrah %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "paddh %[ftmp4], %[ftmp4], %[ff_ph_03] \n\t" + "psrah %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + TRANSPOSE_4H + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), [ftmp2]"=&f"(ftmp[2]), + [ftmp3]"=&f"(ftmp[3]), [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), [ftmp8]"=&f"(ftmp[8]), + [ftmp9]"=&f"(ftmp[9]), [ftmp10]"=&f"(ftmp[10]), + [ftmp11]"=&f"(ftmp[11]), [tmp0]"=&r"(tmp[0]) + : [ip]"r"(input), [op]"r"(output), [ff_ph_03]"f"(ff_ph_03) + : "memory" + ); + + for (i = 0; i < 16; i++) { + mb_dqcoeff[i * 16] = output[i]; + } +} diff --git a/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c b/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c new file mode 100644 index 000000000..f2182f95c --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/loopfilter_filters_mmi.c @@ -0,0 +1,1337 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vp8/common/loopfilter.h" +#include "vp8/common/onyxc_int.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_003f) = { 0x003f003f003f003fULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_0900) = { 0x0900090009000900ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_1200) = { 0x1200120012001200ULL }; +DECLARE_ALIGNED(8, static const uint64_t, + ff_ph_1b00) = { 0x1b001b001b001b00ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_fe) = { 0xfefefefefefefefeULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_80) = { 0x8080808080808080ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_04) = { 0x0404040404040404ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_03) = { 0x0303030303030303ULL }; +DECLARE_ALIGNED(8, static const uint64_t, ff_pb_01) = { 0x0101010101010101ULL }; + +void vp8_loop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint32_t tmp[1]; + mips_reg addr[2]; + double ftmp[12]; + __asm__ volatile ( + "1: \n\t" + "gsldlc1 %[ftmp10], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[limit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp1], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsldlc1 %[ftmp3], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp9], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp10] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "gsldlc1 %[ftmp10], 
0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + + "gsldlc1 %[ftmp10], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp8], %[ftmp2], %[ff_pb_03] \n\t" + "paddsb %[ftmp9], %[ftmp2], %[ff_pb_04] \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "packsshb %[ftmp8], %[ftmp0], %[ftmp11] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + "xor %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + "punpckhbh %[ftmp9], %[ftmp11], %[ftmp9] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "paddsh %[ftmp11], %[ftmp0], %[ff_ph_01] \n\t" + "packsshb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "paddsh %[ftmp9], %[ftmp9], %[ff_ph_01] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "psrah %[ftmp11], %[ftmp11], %[ftmp10] \n\t" + "psrah %[ftmp9], %[ftmp9], %[ftmp10] \n\t" + "packsshb %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "pandn %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp5], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp4], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[addr1]) \n\t" + + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + 
[addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [ff_ph_01]"f"(ff_ph_01), [ff_pb_fe]"f"(ff_pb_fe), + [ff_pb_80]"f"(ff_pb_80), [ff_pb_04]"f"(ff_pb_04), + [ff_pb_03]"f"(ff_pb_03) + : "memory" + ); +} + +void vp8_loop_filter_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit, + const unsigned char *limit, + const unsigned char *thresh, int count) { + uint32_t tmp[1]; + mips_reg addr[2]; + double ftmp[13]; + + __asm__ volatile ( + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SLL (%[tmp0], %[src_pixel_step], 0x01) + MMI_ADDU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + + "gsldlc1 %[ftmp11], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp12], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr0]) \n\t" + "punpcklbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp4], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp5], %[ftmp4], %[ftmp2] \n\t" + "punpckhhw %[ftmp6], %[ftmp4], %[ftmp2] \n\t" + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp1] \n\t" + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp1] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp9], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp10], %[ftmp11], %[ftmp12] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsldlc1 %[ftmp11], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr1]) \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsldlc1 %[ftmp12], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + + "punpcklhw %[ftmp1], %[ftmp11], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp11], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp0], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp0], %[ftmp9] \n\t" + + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[limit]) \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], 
%[ftmp8] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[blimit]) \n\t" + + /* abs (p0-q0) */ + "pasubub %[ftmp11], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp11], %[ftmp11], %[ftmp11] \n\t" + /* abs (p1-q1) */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp1] \n\t" + "paddusb %[ftmp1], %[ftmp11], %[ftmp12] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp0:mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "gsldlc1 %[ftmp8], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[thresh]) \n\t" + + /* ftmp3: abs(q1-q0) ftmp4: abs(p1-p0) */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + "psubusb %[ftmp3], %[ftmp3], %[ftmp8] \n\t" + "or %[ftmp2], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1:hev */ + "xor %[ftmp1], %[ftmp2], %[ftmp1] \n\t" + + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + "psubsb %[ftmp2], %[ftmp5], %[ftmp10] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + "psubsb %[ftmp3], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2:filter_value */ + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + + "paddsb %[ftmp11], %[ftmp2], %[ff_pb_04] \n\t" + "paddsb %[ftmp12], %[ftmp2], %[ff_pb_03] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp12], %[ftmp0], %[ftmp8] \n\t" + + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp11] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp11], %[ftmp0], %[ftmp8] \n\t" + + "psubsb %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "paddsh %[ftmp0], %[ftmp0], %[ff_ph_01] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_01] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "psrah 
%[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packsshb %[ftmp2], %[ftmp0], %[ftmp8] \n\t" + "pandn %[ftmp2], %[ftmp1], %[ftmp2] \n\t" + "psubsb %[ftmp10], %[ftmp10], %[ftmp2] \n\t" + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + + /* ftmp5: *op1 ; ftmp6: *op0 */ + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + /* ftmp9: *oq0 ; ftmp10: *oq1 */ + "punpcklbh %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp3], %[ftmp9], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + "punpcklhw %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "dsrl %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp2], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp2], 0x02(%[addr1]) \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x01) + MMI_SUBU(%[addr1], %[src_ptr], %[tmp0]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + + "dsrl %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x02(%[addr1]) \n\t" + "gsswlc1 %[ftmp1], 0x05(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[src_ptr]) \n\t" + + "dsrl %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "gsswlc1 %[ftmp1], 0x05(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x02(%[addr0]) \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + "dsrl %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[tmp0]) + "gsswlc1 %[ftmp5], 0x05(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x02(%[addr1]) \n\t" + + MMI_ADDIU(%[count], %[count], -0x01) + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_ph_01]"f"(ff_ph_01), [ff_pb_03]"f"(ff_pb_03), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_fe]"f"(ff_pb_fe) + : "memory" + ); +} + +/* clang-format off */ +#define VP8_MBLOOP_HPSRAB \ + "punpcklbh %[ftmp10], %[ftmp10], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" \ + "psrah %[ftmp10], %[ftmp10], %[ftmp9] \n\t" \ + "psrah %[ftmp11], %[ftmp11], %[ftmp9] \n\t" \ + "packsshb %[ftmp0], %[ftmp10], %[ftmp11] \n\t" + +#define VP8_MBLOOP_HPSRAB_ADD(reg) \ + "punpcklbh %[ftmp1], %[ftmp0], %[ftmp12] \n\t" \ + "punpckhbh %[ftmp2], %[ftmp0], %[ftmp12] \n\t" \ + "pmulhh %[ftmp1], %[ftmp1], " #reg " \n\t" \ + "pmulhh %[ftmp2], %[ftmp2], " #reg " \n\t" \ + "paddh %[ftmp1], 
%[ftmp1], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" \ + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" \ + "packsshb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" +/* clang-format on */ + +void vp8_mbloop_filter_horizontal_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + uint32_t tmp[1]; + double ftmp[13]; + + __asm__ volatile ( + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "1: \n\t" + "gsldlc1 %[ftmp9], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[limit]) \n\t" + /* ftmp1: p3 */ + "gsldlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + /* ftmp3: p2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + /* ftmp4: p1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + /* ftmp5: p0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + /* ftmp6: q0 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + /* ftmp7: q1 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + /* ftmp8: q2 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + /* ftmp2: q3 */ + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp2], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[src_ptr]) \n\t" + + "gsldlc1 %[ftmp12], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp12], 0x00(%[blimit]) \n\t" + + "pasubub %[ftmp0], %[ftmp1], %[ftmp3] \n\t" + "psubusb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "pasubub %[ftmp1], %[ftmp3], %[ftmp4] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp10], %[ftmp4], %[ftmp5] \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp11], %[ftmp7], %[ftmp6] \n\t" + "psubusb %[ftmp1], %[ftmp11], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp8], %[ftmp7] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "pasubub %[ftmp1], %[ftmp2], %[ftmp8] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "pasubub %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + "pasubub %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "and %[ftmp2], %[ftmp2], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrlh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp1], %[ftmp1], %[ftmp12] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp9], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[thresh]) \n\t" + "psubusb %[ftmp1], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp2], %[ftmp11], %[ftmp9] \n\t" + "paddb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "xor %[ftmp2], %[ftmp2], %[ftmp2] 
\n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "pcmpeqb %[ftmp2], %[ftmp2], %[ftmp2] \n\t" + /* ftmp1: hev */ + "xor %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp4], %[ftmp7] \n\t" + "psubsb %[ftmp9], %[ftmp6], %[ftmp5] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "pandn %[ftmp12], %[ftmp1], %[ftmp2] \n\t" + "and %[ftmp2], %[ftmp2], %[ftmp1] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_03] \n\t" + VP8_MBLOOP_HPSRAB + "paddsb %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp0], %[ftmp2], %[ff_pb_04] \n\t" + VP8_MBLOOP_HPSRAB + "psubsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1b00]) + "psubsb %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + MMI_SLL(%[tmp0], %[src_pixel_step], 0x02) + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_1200]) + "paddsb %[ftmp4], %[ftmp4], %[ftmp1] \n\t" + "psubsb %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "xor %[ftmp4], %[ftmp4], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp4], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + VP8_MBLOOP_HPSRAB_ADD(%[ff_ph_0900]) + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + "psubsb %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "xor %[ftmp8], %[ftmp8], %[ff_pb_80] \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "gssdlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[tmp0]) + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "addiu %[count], %[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), 
[ff_pb_03]"f"(ff_pb_03), + [ff_ph_0900]"f"(ff_ph_0900), [ff_ph_1b00]"f"(ff_ph_1b00), + [ff_ph_1200]"f"(ff_ph_1200), [ff_ph_003f]"f"(ff_ph_003f) + : "memory" + ); +} + +#define VP8_MBLOOP_VPSRAB_ADDH \ + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" \ + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" \ + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp0] \n\t" + +#define VP8_MBLOOP_VPSRAB_ADDT \ + "paddh %[ftmp7], %[ftmp7], %[ff_ph_003f] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_003f] \n\t" \ + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" \ + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" \ + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] \n\t" + +void vp8_mbloop_filter_vertical_edge_mmi( + unsigned char *src_ptr, int src_pixel_step, const unsigned char *blimit, + const unsigned char *limit, const unsigned char *thresh, int count) { + mips_reg tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, srct[1]); + double ftmp[14]; + + __asm__ volatile ( + MMI_SUBU(%[src_ptr], %[src_ptr], 0x04) + + "1: \n\t" + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp1], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp2], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp3], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp4], %[ftmp11], %[ftmp9] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp5], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp7], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp8], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp12], %[ftmp5], %[ftmp6] \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp8] \n\t" + + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp10] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp10] \n\t" + "punpcklhw %[ftmp7], %[ftmp11], %[ftmp9] \n\t" + "punpckhhw %[ftmp8], %[ftmp11], %[ftmp9] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[limit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[limit]) \n\t" + /* ftmp9:q0 ftmp10:q1 */ + "punpcklwd %[ftmp9], %[ftmp1], %[ftmp5] \n\t" + "punpckhwd %[ftmp10], %[ftmp1], %[ftmp5] \n\t" + /* ftmp11:q2 ftmp12:q3 */ + "punpcklwd %[ftmp11], %[ftmp2], %[ftmp6] \n\t" + "punpckhwd %[ftmp12], %[ftmp2], %[ftmp6] \n\t" + /* srct[0x00]: q3 */ + "sdc1 %[ftmp12], 0x00(%[srct]) \n\t" + /* ftmp1:p3 ftmp2:p2 */ + "punpcklwd %[ftmp1], %[ftmp3], %[ftmp7] \n\t" + "punpckhwd %[ftmp2], %[ftmp3], %[ftmp7] \n\t" + /* srct[0x08]: p3 */ + "sdc1 %[ftmp1], 0x08(%[srct]) \n\t" + /* ftmp5:p1 ftmp6:p0 */ + "punpcklwd %[ftmp5], %[ftmp4], %[ftmp8] 
\n\t" + "punpckhwd %[ftmp6], %[ftmp4], %[ftmp8] \n\t" + + /* abs (q3-q2) */ + "pasubub %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + "psubusb %[ftmp0], %[ftmp7], %[ftmp13] \n\t" + /* abs (q2-q1) */ + "pasubub %[ftmp7], %[ftmp11], %[ftmp10] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp3: abs(q1-q0) */ + "pasubub %[ftmp3], %[ftmp10], %[ftmp9] \n\t" + "psubusb %[ftmp7], %[ftmp3], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* ftmp4: abs(p1-p0) */ + "pasubub %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + "psubusb %[ftmp7], %[ftmp4], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p2-p1) */ + "pasubub %[ftmp7], %[ftmp2], %[ftmp5] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + /* abs (p3-p2) */ + "pasubub %[ftmp7], %[ftmp1], %[ftmp2] \n\t" + "psubusb %[ftmp7], %[ftmp7], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + + "gsldlc1 %[ftmp13], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp13], 0x00(%[blimit]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[thresh]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[thresh]) \n\t" + /* abs (p0-q0) * 2 */ + "pasubub %[ftmp1], %[ftmp9], %[ftmp6] \n\t" + "paddusb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* abs (p1-q1) / 2 */ + "pasubub %[ftmp12], %[ftmp10], %[ftmp5] \n\t" + "and %[ftmp12], %[ftmp12], %[ff_pb_fe] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "psrlh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + "paddusb %[ftmp12], %[ftmp1], %[ftmp12] \n\t" + "psubusb %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + /* ftmp0: mask */ + "pcmpeqb %[ftmp0], %[ftmp0], %[ftmp12] \n\t" + + /* abs(p1-p0) - thresh */ + "psubusb %[ftmp4], %[ftmp4], %[ftmp7] \n\t" + /* abs(q1-q0) - thresh */ + "psubusb %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "or %[ftmp3], %[ftmp4], %[ftmp3] \n\t" + "pcmpeqb %[ftmp3], %[ftmp3], %[ftmp12] \n\t" + "pcmpeqb %[ftmp1], %[ftmp1], %[ftmp1] \n\t" + /* ftmp1: hev */ + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + + /* ftmp2:ps2, ftmp5:ps1, ftmp6:ps0, ftmp9:qs0, ftmp10:qs1, ftmp11:qs2 */ + "xor %[ftmp11], %[ftmp11], %[ff_pb_80] \n\t" + "xor %[ftmp10], %[ftmp10], %[ff_pb_80] \n\t" + "xor %[ftmp9], %[ftmp9], %[ff_pb_80] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp5], %[ftmp5], %[ff_pb_80] \n\t" + "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + + "psubsb %[ftmp3], %[ftmp5], %[ftmp10] \n\t" + "psubsb %[ftmp4], %[ftmp9], %[ftmp6] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + /* filter_value &= mask */ + "and %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + /* Filter2 = filter_value & hev */ + "and %[ftmp3], %[ftmp1], %[ftmp0] \n\t" + /* filter_value &= ~hev */ + "pandn %[ftmp0], %[ftmp1], %[ftmp0] \n\t" + + "paddsb %[ftmp4], %[ftmp3], %[ff_pb_04] \n\t" + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp12] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp4] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp4], %[ftmp7], %[ftmp8] \n\t" + /* ftmp9: qs0 */ + "psubsb %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsb %[ftmp3], %[ftmp3], %[ff_pb_03] \n\t" + "punpcklbh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "punpckhbh %[ftmp8], %[ftmp8], %[ftmp3] \n\t" + "psrah %[ftmp7], %[ftmp7], %[ftmp12] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp12] \n\t" + "packsshb %[ftmp3], %[ftmp7], %[ftmp8] 
\n\t" + /* ftmp6: ps0 */ + "paddsb %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp12] \n\t" + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp9], %[ftmp3] \n\t" + /* ftmp9: oq0 */ + "xor %[ftmp9], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp6], %[ftmp3] \n\t" + /* ftmp6: op0 */ + "xor %[ftmp6], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "paddh %[ftmp1], %[ff_ph_0900], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp10], %[ftmp3] \n\t" + /* ftmp10: oq1 */ + "xor %[ftmp10], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp5], %[ftmp3] \n\t" + /* ftmp5: op1 */ + "xor %[ftmp5], %[ftmp4], %[ff_pb_80] \n\t" + + VP8_MBLOOP_VPSRAB_ADDH + "pmulhh %[ftmp7], %[ftmp7], %[ff_ph_0900] \n\t" + "pmulhh %[ftmp8], %[ftmp8], %[ff_ph_0900] \n\t" + VP8_MBLOOP_VPSRAB_ADDT + "psubsb %[ftmp4], %[ftmp11], %[ftmp3] \n\t" + /* ftmp11: oq2 */ + "xor %[ftmp11], %[ftmp4], %[ff_pb_80] \n\t" + "paddsb %[ftmp4], %[ftmp2], %[ftmp3] \n\t" + /* ftmp2: op2 */ + "xor %[ftmp2], %[ftmp4], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp12], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp8], 0x08(%[srct]) \n\t" + + "punpcklbh %[ftmp0], %[ftmp8], %[ftmp2] \n\t" + "punpckhbh %[ftmp1], %[ftmp8], %[ftmp2] \n\t" + "punpcklbh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "punpckhbh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp5], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp6], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp7], %[ftmp1], %[ftmp3] \n\t" + + "punpcklbh %[ftmp0], %[ftmp9], %[ftmp10] \n\t" + "punpckhbh %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + "punpcklbh %[ftmp2], %[ftmp11], %[ftmp12] \n\t" + "punpckhbh %[ftmp3], %[ftmp11], %[ftmp12] \n\t" + "punpcklhw %[ftmp8], %[ftmp0], %[ftmp2] \n\t" + "punpckhhw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp10], %[ftmp1], %[ftmp3] \n\t" + "punpckhhw %[ftmp11], %[ftmp1], %[ftmp3] \n\t" + + "punpcklwd %[ftmp0], %[ftmp7], %[ftmp11] \n\t" + "punpckhwd %[ftmp1], %[ftmp7], %[ftmp11] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp0], %[ftmp6], %[ftmp10] \n\t" + "punpckhwd %[ftmp1], %[ftmp6], %[ftmp10] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp9] \n\t" + "punpckhwd %[ftmp0], %[ftmp5], %[ftmp9] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + "punpcklwd %[ftmp1], %[ftmp4], %[ftmp8] \n\t" + "punpckhwd %[ftmp0], %[ftmp4], %[ftmp8] \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp0], 
0x00(%[src_ptr]) \n\t" + MMI_SUBU(%[src_ptr], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp1], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + "addiu %[count], %[count], -0x01 \n\t" + + MMI_SLL(%[tmp0], %[src_pixel_step], 0x03) + MMI_ADDU(%[src_ptr], %[src_ptr], %[tmp0]) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [ftmp13]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [src_ptr]"+&r"(src_ptr), + [count]"+&r"(count) + : [limit]"r"(limit), [blimit]"r"(blimit), + [srct]"r"(srct), [thresh]"r"(thresh), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [ff_ph_003f]"f"(ff_ph_003f), [ff_ph_0900]"f"(ff_ph_0900), + [ff_pb_03]"f"(ff_pb_03), [ff_pb_04]"f"(ff_pb_04), + [ff_pb_80]"f"(ff_pb_80), [ff_pb_fe]"f"(ff_pb_fe) + : "memory" + ); +} + +#define VP8_SIMPLE_HPSRAB \ + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" \ + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" \ + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" \ + "psrah %[ftmp1], %[ftmp5], %[ftmp10] \n\t" \ + "psllh %[ftmp1], %[ftmp1], %[ftmp8] \n\t" \ + "or %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + +void vp8_loop_filter_simple_horizontal_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint32_t tmp[1], count = 2; + mips_reg addr[2]; + double ftmp[12]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp3], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[blimit]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsldlc1 %[ftmp2], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + "pasubub %[ftmp1], %[ftmp7], %[ftmp2] \n\t" + "and %[ftmp1], %[ftmp1], %[ff_pb_fe] \n\t" + "psrlh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsldlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + "pasubub %[ftmp5], %[ftmp6], %[ftmp0] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp3] \n\t" + + "xor %[ftmp2], %[ftmp2], %[ff_pb_80] \n\t" + "xor %[ftmp7], %[ftmp7], %[ff_pb_80] \n\t" + "psubsb %[ftmp2], %[ftmp2], %[ftmp7] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp0], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddsb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + VP8_SIMPLE_HPSRAB + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "gssdlc1 %[ftmp3], 0x07(%[src_ptr]) \n\t" + "gssdrc1 %[ftmp3], 0x00(%[src_ptr]) \n\t" + + "psubsb %[ftmp5], 
%[ftmp5], %[ff_pb_01] \n\t" + VP8_SIMPLE_HPSRAB + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gssdlc1 %[ftmp6], 0x07(%[addr1]) \n\t" + "gssdrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "addiu %[count], %[count], -0x01 \n\t" + MMI_ADDIU(%[src_ptr], %[src_ptr], 0x08) + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [blimit]"r"(blimit), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + : "memory" + ); +} + +void vp8_loop_filter_simple_vertical_edge_mmi(unsigned char *src_ptr, + int src_pixel_step, + const unsigned char *blimit) { + uint32_t tmp[1], count = 2; + mips_reg addr[2]; + DECLARE_ALIGNED(8, const uint64_t, srct[1]); + double ftmp[12]; + + __asm__ volatile ( + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x4]) + MMI_SUBU(%[src_ptr], %[src_ptr], 0x02) + + "1: \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[src_pixel_step]) + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[src_ptr]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "punpckhhw %[ftmp5], %[ftmp4], %[ftmp6] \n\t" + "punpcklhw %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gslwlc1 %[ftmp7], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gslwlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp6], %[ftmp6], %[ftmp7] \n\t" + + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp1], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gslwlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gslwrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp1] \n\t" + + "punpckhhw %[ftmp2], %[ftmp0], %[ftmp6] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpckhwd %[ftmp1], %[ftmp0], %[ftmp4] \n\t" + "punpcklwd %[ftmp0], %[ftmp0], %[ftmp4] \n\t" + "punpckhwd %[ftmp3], %[ftmp2], %[ftmp5] \n\t" + "punpcklwd %[ftmp2], %[ftmp2], %[ftmp5] \n\t" + + "li %[tmp0], 0x01 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pasubub %[ftmp6], %[ftmp3], %[ftmp0] \n\t" + "and %[ftmp6], %[ftmp6], %[ff_pb_fe] \n\t" + "psrlh %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "pasubub %[ftmp5], %[ftmp1], %[ftmp2] \n\t" + "paddusb %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddusb %[ftmp5], 
%[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp7], 0x07(%[blimit]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[blimit]) \n\t" + "psubusb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "pcmpeqb %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + + "sdc1 %[ftmp0], 0x00(%[srct]) \n\t" + "sdc1 %[ftmp3], 0x08(%[srct]) \n\t" + + "xor %[ftmp0], %[ftmp0], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + + "xor %[ftmp6], %[ftmp1], %[ff_pb_80] \n\t" + "xor %[ftmp3], %[ftmp2], %[ff_pb_80] \n\t" + "psubsb %[ftmp7], %[ftmp3], %[ftmp6] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "paddsb %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "paddsb %[ftmp5], %[ftmp5], %[ff_pb_04] \n\t" + + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp7], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp7], %[ftmp7], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp7] \n\t" + "psubsb %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + "xor %[ftmp3], %[ftmp3], %[ff_pb_80] \n\t" + "psubsb %[ftmp5], %[ftmp5], %[ff_pb_01] \n\t" + + "li %[tmp0], 0x03 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psllh %[ftmp0], %[ftmp5], %[ftmp8] \n\t" + "psrah %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + "psrlh %[ftmp0], %[ftmp0], %[ftmp8] \n\t" + + "li %[tmp0], 0x0b \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "psllh %[ftmp5], %[ftmp5], %[ftmp8] \n\t" + "or %[ftmp0], %[ftmp0], %[ftmp5] \n\t" + "paddsb %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ff_pb_80] \n\t" + + "ldc1 %[ftmp0], 0x00(%[srct]) \n\t" + "ldc1 %[ftmp4], 0x08(%[srct]) \n\t" + + "punpckhbh %[ftmp1], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp0], %[ftmp0], %[ftmp6] \n\t" + "punpcklbh %[ftmp2], %[ftmp3], %[ftmp4] \n\t" + "punpckhbh %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + + "punpckhhw %[ftmp6], %[ftmp0], %[ftmp2] \n\t" + "punpcklhw %[ftmp0], %[ftmp0], %[ftmp2] \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + "punpckhhw %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + "punpcklhw %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + + "dsrl %[ftmp0], %[ftmp0], %[ftmp10] \n\t" + MMI_SUBU(%[addr1], %[addr0], %[src_pixel_step_x4]) + "gsswlc1 %[ftmp0], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp0], 0x00(%[addr1]) \n\t" + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + "dsrl %[ftmp6], %[ftmp6], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[src_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[src_ptr]) \n\t" + + MMI_SUBU(%[addr1], %[src_ptr], %[src_pixel_step]) + "gsswlc1 %[ftmp6], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp6], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[addr1], %[src_ptr], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + "dsrl %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[addr0]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[addr0]) \n\t" + + "dsrl %[ftmp5], %[ftmp5], %[ftmp10] \n\t" + MMI_ADDU(%[addr1], %[addr0], %[src_pixel_step_x2]) + "gsswlc1 %[ftmp5], 0x03(%[addr1]) \n\t" + "gsswrc1 %[ftmp5], 0x00(%[addr1]) \n\t" + + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixel_step_x8]) + "addiu %[count], 
%[count], -0x01 \n\t" + "bnez %[count], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [addr1]"=&r"(addr[1]), + [src_ptr]"+&r"(src_ptr), [count]"+&r"(count) + : [blimit]"r"(blimit), [srct]"r"(srct), + [src_pixel_step]"r"((mips_reg)src_pixel_step), + [src_pixel_step_x2]"r"((mips_reg)(src_pixel_step<<1)), + [src_pixel_step_x4]"r"((mips_reg)(src_pixel_step<<2)), + [src_pixel_step_x8]"r"((mips_reg)(src_pixel_step<<3)), + [ff_pb_fe]"f"(ff_pb_fe), [ff_pb_80]"f"(ff_pb_80), + [ff_pb_04]"f"(ff_pb_04), [ff_pb_01]"f"(ff_pb_01) + : "memory" + ); +} + +/* Horizontal MB filtering */ +void vp8_loop_filter_mbh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_horizontal_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(u_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_horizontal_edge_mmi(v_ptr, uv_stride, lfi->mblim, + lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical MB Filtering */ +void vp8_loop_filter_mbv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_mbloop_filter_vertical_edge_mmi(y_ptr, y_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_mbloop_filter_vertical_edge_mmi(u_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_mbloop_filter_vertical_edge_mmi(v_ptr, uv_stride, lfi->mblim, lfi->lim, + lfi->hev_thr, 1); +} + +/* Horizontal B Filtering */ +void vp8_loop_filter_bh_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, lfi->blim, + lfi->lim, lfi->hev_thr, 2); + vp8_loop_filter_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_horizontal_edge_mmi(u_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_horizontal_edge_mmi(v_ptr + 4 * uv_stride, uv_stride, + lfi->blim, lfi->lim, lfi->hev_thr, 1); +} + +/* Vertical B Filtering */ +void vp8_loop_filter_bv_mmi(unsigned char *y_ptr, unsigned char *u_ptr, + unsigned char *v_ptr, int y_stride, int uv_stride, + loop_filter_info *lfi) { + vp8_loop_filter_vertical_edge_mmi(y_ptr + 4, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 8, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + vp8_loop_filter_vertical_edge_mmi(y_ptr + 12, y_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 2); + + if (u_ptr) + vp8_loop_filter_vertical_edge_mmi(u_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); + + if (v_ptr) + vp8_loop_filter_vertical_edge_mmi(v_ptr + 4, uv_stride, lfi->blim, lfi->lim, + lfi->hev_thr, 1); +} + +void vp8_loop_filter_bhs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 4 * y_stride, y_stride, + blimit); + 
vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 8 * y_stride, y_stride, + blimit); + vp8_loop_filter_simple_horizontal_edge_mmi(y_ptr + 12 * y_stride, y_stride, + blimit); +} + +void vp8_loop_filter_bvs_mmi(unsigned char *y_ptr, int y_stride, + const unsigned char *blimit) { + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 4, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 8, y_stride, blimit); + vp8_loop_filter_simple_vertical_edge_mmi(y_ptr + 12, y_stride, blimit); +} diff --git a/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c b/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c new file mode 100644 index 000000000..77d665d45 --- /dev/null +++ b/libvpx/vp8/common/mips/mmi/sixtap_filter_mmi.c @@ -0,0 +1,416 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/filter.h" +#include "vpx_ports/asmdefs_mmi.h" + +DECLARE_ALIGNED(8, static const int16_t, vp8_six_tap_mmi[8][6 * 8]) = { + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, 0x0080, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, 0x004d, + 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, 0xfff0, + 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003, 0x0003 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, 0x0032, + 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 0x005d, 
0x005d, 0x005d, + 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, 0xfff7, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, + { 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, 0x0001, + 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, 0xfff8, + 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, 0x0024, + 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, 0x006c, + 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, 0xfff5, + 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002, 0x0002 }, + { 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, + 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, 0x000c, + 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, 0x007b, + 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, 0xfffa, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 } +}; + +/* Horizontal filter: pixel_step is 1, output_height and output_width are + the size of horizontal filtering output, output_height is always H + 5 */ +static INLINE void vp8_filter_block1d_h6_mmi(unsigned char *src_ptr, + uint16_t *output_ptr, + unsigned int src_pixels_per_line, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp8_filter) { + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "li %[tmp0], 0x08 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp9], 0x05(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp9], -0x02(%[src_ptr]) \n\t" + "gsldlc1 %[ftmp10], 0x06(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp10], -0x01(%[src_ptr]) \n\t" + + "punpcklbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp8], %[ftmp6], %[ftmp0] \n\t" + + "punpckhbh %[ftmp6], %[ftmp9], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp4] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp1] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "punpckhbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp5] \n\t" + 
"paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "dsrl %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "punpcklbh %[ftmp6], %[ftmp10], %[fzero] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp3] \n\t" + "paddsh %[ftmp8], %[ftmp8], %[ftmp6] \n\t" + + "paddsh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" + "psrah %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + "packushb %[ftmp8], %[ftmp8], %[fzero] \n\t" + "punpcklbh %[ftmp8], %[ftmp8], %[fzero] \n\t" + "gssdlc1 %[ftmp8], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x00(%[output_ptr]) \n\t" + + "addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [tmp0]"=&r"(tmp[0]), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height), + [src_ptr]"+&r"(src_ptr) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [vp8_filter]"r"(vp8_filter), [output_width]"r"(output_width), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* Horizontal filter: pixel_step is always W */ +static INLINE void vp8_filter_block1dc_v6_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line, const int16_t *vp8_filter) { + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + uint32_t tmp[1]; + mips_reg addr[1]; +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); + register double ftmp2 asm("$f6"); + register double ftmp3 asm("$f8"); + register double ftmp4 asm("$f10"); + register double ftmp5 asm("$f12"); + register double ftmp6 asm("$f14"); + register double ftmp7 asm("$f16"); + register double ftmp8 asm("$f18"); + register double ftmp9 asm("$f20"); + register double ftmp10 asm("$f22"); + register double ftmp11 asm("$f24"); + register double ftmp12 asm("$f26"); + register double ftmp13 asm("$f28"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f3"); + register double ftmp3 asm("$f4"); + register double ftmp4 asm("$f5"); + register double ftmp5 asm("$f6"); + register double ftmp6 asm("$f7"); + register double ftmp7 asm("$f8"); + register double ftmp8 asm("$f9"); + register double ftmp9 asm("$f10"); + register double ftmp10 asm("$f11"); + register double ftmp11 asm("$f12"); + register double ftmp12 asm("$f13"); + register double ftmp13 asm("$f14"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "ldc1 %[ftmp0], 0x00(%[vp8_filter]) \n\t" + "ldc1 %[ftmp1], 0x10(%[vp8_filter]) \n\t" + "ldc1 %[ftmp2], 0x20(%[vp8_filter]) \n\t" + "ldc1 %[ftmp3], 0x30(%[vp8_filter]) \n\t" + "ldc1 %[ftmp4], 0x40(%[vp8_filter]) \n\t" + "ldc1 %[ftmp5], 0x50(%[vp8_filter]) \n\t" + "xor %[fzero], %[fzero], %[fzero] \n\t" + "li %[tmp0], 0x07 \n\t" + "mtc1 %[tmp0], %[ftmp13] \n\t" + + /* In order to make full use of memory load delay slot, + * Operation of memory loading and calculating has been rearranged. 
+ */ + "1: \n\t" + "gsldlc1 %[ftmp6], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line]) + "gsldlc1 %[ftmp7], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp8], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[addr0]) \n\t" + + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp9], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp9], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x2]) + "gsldlc1 %[ftmp10], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp10], 0x00(%[addr0]) \n\t" + MMI_ADDU(%[addr0], %[src_ptr], %[pixels_per_line_x4]) + "gsldlc1 %[ftmp11], 0x07(%[addr0]) \n\t" + "gsldrc1 %[ftmp11], 0x00(%[addr0]) \n\t" + + "pmullh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + + "pmullh %[ftmp7], %[ftmp7], %[ftmp1] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp7] \n\t" + + "pmullh %[ftmp8], %[ftmp8], %[ftmp2] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp8] \n\t" + + "pmullh %[ftmp9], %[ftmp9], %[ftmp4] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp9] \n\t" + + "pmullh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp10] \n\t" + + "pmullh %[ftmp11], %[ftmp11], %[ftmp5] \n\t" + "paddsh %[ftmp12], %[ftmp12], %[ftmp11] \n\t" + + "paddsh %[ftmp12], %[ftmp12], %[ff_ph_40] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp13] \n\t" + "packushb %[ftmp12], %[ftmp12], %[fzero] \n\t" + "gsswlc1 %[ftmp12], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp12], 0x00(%[output_ptr]) \n\t" + + MMI_ADDIU(%[output_height], %[output_height], -0x01) + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), + [ftmp3]"=&f"(ftmp3), [ftmp4]"=&f"(ftmp4), + [ftmp5]"=&f"(ftmp5), [ftmp6]"=&f"(ftmp6), + [ftmp7]"=&f"(ftmp7), [ftmp8]"=&f"(ftmp8), + [ftmp9]"=&f"(ftmp9), [ftmp10]"=&f"(ftmp10), + [ftmp11]"=&f"(ftmp11), [ftmp12]"=&f"(ftmp12), + [ftmp13]"=&f"(ftmp13), [tmp0]"=&r"(tmp[0]), + [addr0]"=&r"(addr[0]), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [pixels_per_line_x2]"r"((mips_reg)(pixels_per_line<<1)), + [pixels_per_line_x4]"r"((mips_reg)(pixels_per_line<<2)), + [vp8_filter]"r"(vp8_filter), + [output_pitch]"r"((mips_reg)output_pitch), + [ff_ph_40]"f"(ff_ph_40) + : "memory" + ); +} + +/* When xoffset == 0, vp8_filter = {0, 0, 128, 0, 0, 0}; the functions + vp8_filter_block1d_h6_mmi and vp8_filter_block1dc_v6_mmi can + be simplified. */ +static INLINE void vp8_filter_block1d_h6_filter0_mmi( + unsigned char *src_ptr, uint16_t *output_ptr, + unsigned int src_pixels_per_line, unsigned int output_height, + unsigned int output_width) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[src_pixels_per_line]) + + "punpcklbh %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gssdlc1 %[ftmp1], 0x07(%[output_ptr]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + 
"addiu %[output_height], %[output_height], -0x01 \n\t" + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_width]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [src_pixels_per_line]"r"((mips_reg)src_pixels_per_line), + [output_width]"r"(output_width) + : "memory" + ); +} + +static INLINE void vp8_filter_block1dc_v6_filter0_mmi( + uint16_t *src_ptr, unsigned char *output_ptr, unsigned int output_height, + int output_pitch, unsigned int pixels_per_line) { +#if _MIPS_SIM == _ABIO32 + register double fzero asm("$f0"); + register double ftmp0 asm("$f2"); + register double ftmp1 asm("$f4"); +#else + register double fzero asm("$f0"); + register double ftmp0 asm("$f1"); + register double ftmp1 asm("$f2"); +#endif // _MIPS_SIM == _ABIO32 + + __asm__ volatile ( + "xor %[fzero], %[fzero], %[fzero] \n\t" + + "1: \n\t" + "gsldlc1 %[ftmp0], 0x07(%[src_ptr]) \n\t" + "gsldrc1 %[ftmp0], 0x00(%[src_ptr]) \n\t" + MMI_ADDU(%[src_ptr], %[src_ptr], %[pixels_per_line]) + MMI_ADDIU(%[output_height], %[output_height], -0x01) + "packushb %[ftmp1], %[ftmp0], %[fzero] \n\t" + "gsswlc1 %[ftmp1], 0x03(%[output_ptr]) \n\t" + "gsswrc1 %[ftmp1], 0x00(%[output_ptr]) \n\t" + + MMI_ADDU(%[output_ptr], %[output_ptr], %[output_pitch]) + "bnez %[output_height], 1b \n\t" + : [fzero]"=&f"(fzero), [ftmp0]"=&f"(ftmp0), + [ftmp1]"=&f"(ftmp1), [src_ptr]"+&r"(src_ptr), + [output_ptr]"+&r"(output_ptr), [output_height]"+&r"(output_height) + : [pixels_per_line]"r"((mips_reg)pixels_per_line), + [output_pitch]"r"((mips_reg)output_pitch) + : "memory" + ); +} + +#define sixtapNxM(n, m) \ + void vp8_sixtap_predict##n##x##m##_mmi( \ + unsigned char *src_ptr, int src_pixels_per_line, int xoffset, \ + int yoffset, unsigned char *dst_ptr, int dst_pitch) { \ + DECLARE_ALIGNED(16, uint16_t, \ + FData2[(n + 5) * (n == 16 ? 24 : (n == 8 ? 
16 : n))]); \ + const int16_t *HFilter, *VFilter; \ + int i, loop = n / 4; \ + HFilter = vp8_six_tap_mmi[xoffset]; \ + VFilter = vp8_six_tap_mmi[yoffset]; \ + \ + if (xoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_filter0_mmi( \ + src_ptr - (2 * src_pixels_per_line) + i * 4, FData2 + i * 4, \ + src_pixels_per_line, m + 5, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1d_h6_mmi(src_ptr - (2 * src_pixels_per_line) + i * 4, \ + FData2 + i * 4, src_pixels_per_line, m + 5, \ + n * 2, HFilter); \ + } \ + } \ + if (yoffset == 0) { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_filter0_mmi( \ + FData2 + n * 2 + i * 4, dst_ptr + i * 4, m, dst_pitch, n * 2); \ + } \ + } else { \ + for (i = 0; i < loop; ++i) { \ + vp8_filter_block1dc_v6_mmi(FData2 + i * 4, dst_ptr + i * 4, m, \ + dst_pitch, n * 2, VFilter); \ + } \ + } \ + } + +sixtapNxM(4, 4); +sixtapNxM(8, 8); +sixtapNxM(8, 4); +sixtapNxM(16, 16); diff --git a/libvpx/vp8/common/onyxd.h b/libvpx/vp8/common/onyxd.h index cc2cb8089..d3c1b0e97 100644 --- a/libvpx/vp8/common/onyxd.h +++ b/libvpx/vp8/common/onyxd.h @@ -22,6 +22,7 @@ extern "C" { #include "vpx/vp8.h" struct VP8D_COMP; +struct VP8Common; typedef struct { int Width; @@ -45,6 +46,7 @@ int vp8dx_receive_compressed_data(struct VP8D_COMP *comp, size_t size, int vp8dx_get_raw_frame(struct VP8D_COMP *comp, YV12_BUFFER_CONFIG *sd, int64_t *time_stamp, int64_t *time_end_stamp, vp8_ppflags_t *flags); +int vp8dx_references_buffer(struct VP8Common *oci, int ref_frame); vpx_codec_err_t vp8dx_get_reference(struct VP8D_COMP *comp, enum vpx_ref_frame_type ref_frame_flag, diff --git a/libvpx/vp8/common/reconintra.c b/libvpx/vp8/common/reconintra.c index 986074ec7..8e2094da8 100644 --- a/libvpx/vp8/common/reconintra.c +++ b/libvpx/vp8/common/reconintra.c @@ -71,8 +71,16 @@ void vp8_build_intra_predictors_mbuv_s( unsigned char *uleft, unsigned char *vleft, int left_stride, unsigned char *upred_ptr, unsigned char *vpred_ptr, int pred_stride) { MB_PREDICTION_MODE uvmode = x->mode_info_context->mbmi.uv_mode; +#if HAVE_VSX + /* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + uleft_col and vleft_col. Play it safe by reserving enough stack + space here. */ + unsigned char uleft_col[16]; + unsigned char vleft_col[16]; +#else unsigned char uleft_col[8]; unsigned char vleft_col[8]; +#endif int i; intra_pred_fn fn; diff --git a/libvpx/vp8/common/reconintra4x4.c b/libvpx/vp8/common/reconintra4x4.c index 7852cf9da..64d33a287 100644 --- a/libvpx/vp8/common/reconintra4x4.c +++ b/libvpx/vp8/common/reconintra4x4.c @@ -40,7 +40,15 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left) { - unsigned char Aboveb[12], *Above = Aboveb + 4; +/* Power PC implementation uses "vec_vsx_ld" to read 16 bytes from + Above (aka, Aboveb + 4). Play it safe by reserving enough stack + space here. Similarly for "Left". */ +#if HAVE_VSX + unsigned char Aboveb[20]; +#else + unsigned char Aboveb[12]; +#endif + unsigned char *Above = Aboveb + 4; #if HAVE_NEON // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it // over reads but does not use the extra 4 values. @@ -50,6 +58,8 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, // indeed read, they are not used.
vp8_zero_array(Left, 8); #endif // VPX_WITH_ASAN +#elif HAVE_VSX + unsigned char Left[16]; #else unsigned char Left[4]; #endif // HAVE_NEON diff --git a/libvpx/vp8/common/rtcd_defs.pl b/libvpx/vp8/common/rtcd_defs.pl index bc5e05799..3df745f75 100644 --- a/libvpx/vp8/common/rtcd_defs.pl +++ b/libvpx/vp8/common/rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vp8_common_forward_decls() { print <<EOF /* @@ -22,67 +32,71 @@ forward_decls qw/vp8_common_forward_decls/; # Dequant # add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc"; -specialize qw/vp8_dequantize_b mmx neon msa/; +specialize qw/vp8_dequantize_b mmx neon msa mmi/; add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride"; -specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/; +specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/; +specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/; +specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/; # # Loopfilter # add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa/; +specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa/; +specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa/; +specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa/; +specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa/; +specialize qw/vp8_loop_filter_simple_mbv sse2 neon msa mmi/; $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; $vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon; $vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa; +$vp8_loop_filter_simple_mbv_mmi=vp8_loop_filter_simple_vertical_edge_mmi; add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int 
ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa/; +specialize qw/vp8_loop_filter_simple_mbh sse2 neon msa mmi/; $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c; $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2; $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon; $vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa; +$vp8_loop_filter_simple_mbh_mmi=vp8_loop_filter_simple_horizontal_edge_mmi; add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_bv sse2 neon msa/; +specialize qw/vp8_loop_filter_simple_bv sse2 neon msa mmi/; $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; $vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon; $vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa; +$vp8_loop_filter_simple_bv_mmi=vp8_loop_filter_bvs_mmi; add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_bh sse2 neon msa/; +specialize qw/vp8_loop_filter_simple_bh sse2 neon msa mmi/; $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c; $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2; $vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon; $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa; +$vp8_loop_filter_simple_bh_mmi=vp8_loop_filter_bhs_mmi; # # IDCT # #idct16 add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"; -specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/; +specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa mmi/; #iwalsh1 add_proto qw/void vp8_short_inv_walsh4x4_1/, "short *input, short *output"; @@ -90,23 +104,23 @@ specialize qw/vp8_short_inv_walsh4x4_1 dspr2/; #iwalsh16 add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output"; -specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa/; +specialize qw/vp8_short_inv_walsh4x4 sse2 neon dspr2 msa mmi/; #idct1_scalar_add add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/; +specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/; # # RECON # add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa/; +specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/; add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/; +specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/; add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/; +specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; # # Postproc @@ -132,16 +146,16 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, 
int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa/; +specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/; add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/; @@ -172,22 +186,22 @@ if ($opts{arch} =~ /x86/) { # Forward DCT # add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct4x4 sse2 neon msa/; +specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct8x4 sse2 neon msa/; +specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/; add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_walsh4x4 sse2 neon msa/; +specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/; # # Quantizer # add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *"; -specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/; +specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/; add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *"; -specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/; +specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/; # # Block subtraction diff --git a/libvpx/vp8/common/threading.h b/libvpx/vp8/common/threading.h index ece64f3fb..b082bf109 100644 --- a/libvpx/vp8/common/threading.h +++ b/libvpx/vp8/common/threading.h @@ -191,47 +191,18 @@ static inline int sem_destroy(sem_t *sem) { #define x86_pause_hint() #endif -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define USE_MUTEX_LOCK 1 -#endif -#endif - #include "vpx_util/vpx_thread.h" +#include "vpx_util/vpx_atomics.h" -static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) { - (void)mutex; -#if defined(USE_MUTEX_LOCK) - int ret; - pthread_mutex_lock(mutex); - ret = *p; - pthread_mutex_unlock(mutex); - return ret; -#endif - return *p; -} - -static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col, - const int *last_row_current_mb_col, - const int nsync) { - while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) { +static INLINE void vp8_atomic_spin_wait( + int mb_col, const vpx_atomic_int *last_row_current_mb_col, + const int nsync) { + while (mb_col > (vpx_atomic_load_acquire(last_row_current_mb_col) - nsync)) { x86_pause_hint(); thread_sleep(0); } } -static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) { - (void)mutex; -#if defined(USE_MUTEX_LOCK) - pthread_mutex_lock(mutex); - *p = v; - pthread_mutex_unlock(mutex); - return; -#endif - *p = v; -} - -#undef USE_MUTEX_LOCK #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ #ifdef __cplusplus diff --git a/libvpx/vp8/common/vp8_loopfilter.c b/libvpx/vp8/common/vp8_loopfilter.c index c6430be46..9fb125065 100644 
--- a/libvpx/vp8/common/vp8_loopfilter.c +++ b/libvpx/vp8/common/vp8_loopfilter.c @@ -111,11 +111,9 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - /* Abs value */ if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; - } else /* Delta Value */ - { + } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } lvl_seg = (lvl_seg > 0) ? ((lvl_seg > 63) ? 63 : lvl_seg) : 0; @@ -344,8 +342,7 @@ void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int frame_type) { mode_info_context++; /* Skip border mb */ } - } else /* SIMPLE_LOOPFILTER */ - { + } else { /* SIMPLE_LOOPFILTER */ for (mb_row = 0; mb_row < mb_rows; ++mb_row) { for (mb_col = 0; mb_col < mb_cols; ++mb_col) { int skip_lf = (mode_info_context->mbmi.mode != B_PRED && diff --git a/libvpx/vp8/common/vp8_skin_detection.c b/libvpx/vp8/common/vp8_skin_detection.c new file mode 100644 index 000000000..6739efa5f --- /dev/null +++ b/libvpx/vp8/common/vp8_skin_detection.c @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp8/common/alloccommon.h" +#include "vp8/common/vp8_skin_detection.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_mem/vpx_mem.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +static int avg_2x2(const uint8_t *s, int p) { + int i, j; + int sum = 0; + for (i = 0; i < 2; ++i, s += p) { + for (j = 0; j < 2; ++j) { + sum += s[j]; + } + } + return (sum + 2) >> 2; +} + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn) { + // No skin if block has been zero/small motion for long consecutive time. + if (consec_zeromv > 60 && curr_motion_magn == 0) { + return 0; + } else { + int motion = 1; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; + if (bsize == SKIN_16X16) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 7 * stride + 7, stride); + const int usource = avg_2x2(u + 3 * strideuv + 3, strideuv); + const int vsource = avg_2x2(v + 3 * strideuv + 3, strideuv); + return vpx_skin_pixel(ysource, usource, vsource, motion); + } else { + int num_skin = 0; + int i, j; + for (i = 0; i < 2; i++) { + for (j = 0; j < 2; j++) { + // Take the average of center 2x2 pixels. + const int ysource = avg_2x2(y + 3 * stride + 3, stride); + const int usource = avg_2x2(u + strideuv + 1, strideuv); + const int vsource = avg_2x2(v + strideuv + 1, strideuv); + num_skin += vpx_skin_pixel(ysource, usource, vsource, motion); + if (num_skin >= 2) return 1; + y += 8; + u += 4; + v += 4; + } + y += (stride << 3) - 16; + u += (strideuv << 2) - 8; + v += (strideuv << 2) - 8; + } + + return 0; + } + } +} + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. 
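Note that avg_2x2() above rounds to nearest rather than truncating: the +2 bias is half of the divisor 4. The SKIN_16X16 path then classifies the block from the rounded average of its center 2x2 pixels (y + 7 * stride + 7 for luma, 3 * strideuv + 3 for chroma). A self-contained check, duplicating the helper since this file is new in the diff:

    #include <assert.h>
    #include <stdint.h>

    static int avg_2x2(const uint8_t *s, int p) {
      int i, j, sum = 0;
      for (i = 0; i < 2; ++i, s += p) {
        for (j = 0; j < 2; ++j) sum += s[j];
      }
      return (sum + 2) >> 2; /* round to nearest instead of flooring */
    }

    int main(void) {
      /* stride 2, so the 2x2 patch is rows {10, 11} and {12, 13} */
      const uint8_t patch[4] = { 10, 11, 12, 13 };
      assert(avg_2x2(patch, 2) == 12); /* (46 + 2) >> 2 */
      return 0;
    }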
+void vp8_compute_skin_map(VP8_COMP *const cpi, FILE *yuv_skinmap_file) { + int i, j, mb_row, mb_col, num_bl; + VP8_COMMON *const cm = &cpi->common; + uint8_t *y; + const uint8_t *src_y = cpi->Source->y_buffer; + const int src_ystride = cpi->Source->y_stride; + int offset = 0; + + YV12_BUFFER_CONFIG skinmap; + memset(&skinmap, 0, sizeof(skinmap)); + if (vp8_yv12_alloc_frame_buffer(&skinmap, cm->Width, cm->Height, + VP8BORDERINPIXELS) < 0) { + vpx_free_frame_buffer(&skinmap); + return; + } + memset(skinmap.buffer_alloc, 128, skinmap.frame_size); + y = skinmap.y_buffer; + // Loop through blocks and set skin map based on center pixel of block. + // Set y to white for skin block, otherwise set to source with gray scale. + for (mb_row = 0; mb_row < cm->mb_rows; mb_row += 1) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col += 1) { + const int is_skin = cpi->skin_map[offset++]; + for (i = 0; i < 16; i++) { + for (j = 0; j < 16; j++) { + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; + } + } + num_bl++; + y += 16; + src_y += 16; + } + y += (src_ystride << 4) - (num_bl << 4); + src_y += (src_ystride << 4) - (num_bl << 4); + } + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); + vpx_free_frame_buffer(&skinmap); +} +#endif // OUTPUT_YUV_SKINMAP diff --git a/libvpx/vp8/common/vp8_skin_detection.h b/libvpx/vp8/common/vp8_skin_detection.h new file mode 100644 index 000000000..4d27f5eb2 --- /dev/null +++ b/libvpx/vp8/common/vp8_skin_detection.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_COMMON_SKIN_DETECTION_H_ +#define VP8_COMMON_SKIN_DETECTION_H_ + +#include "vp8/encoder/onyx_int.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +typedef enum { + // Skin detection based on 8x8 block. If two of them are identified as skin, + // the macroblock is marked as skin. + SKIN_8X8, + // Skin detection based on 16x16 block. + SKIN_16X16 +} SKIN_DETECTION_BLOCK_SIZE; + +int vp8_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, + int stride, int strideuv, + SKIN_DETECTION_BLOCK_SIZE bsize, int consec_zeromv, + int curr_motion_magn); + +#ifdef OUTPUT_YUV_SKINMAP +// For viewing skin map on input source. 
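The long run of one-line hunks below adds an explicit SECTION .text directive to each x86 assembly file. The apparent intent is to have the code emitted into an explicitly named text section regardless of what section the %include'd x86_abi_support.asm or the preceding macro and data definitions left open, keeping nasm and yasm output consistent; there is no functional change beyond section placement.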
+void vp8_compute_skin_map(struct VP8_COMP *const cpi, FILE *yuv_skinmap_file); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VP8_COMMON_SKIN_DETECTION_H_ diff --git a/libvpx/vp8/common/x86/copy_sse2.asm b/libvpx/vp8/common/x86/copy_sse2.asm index 86fae2695..480faa255 100644 --- a/libvpx/vp8/common/x86/copy_sse2.asm +++ b/libvpx/vp8/common/x86/copy_sse2.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_copy32xn_sse2( ; unsigned char *src_ptr, diff --git a/libvpx/vp8/common/x86/copy_sse3.asm b/libvpx/vp8/common/x86/copy_sse3.asm index d789a40cc..31ea898a3 100644 --- a/libvpx/vp8/common/x86/copy_sse3.asm +++ b/libvpx/vp8/common/x86/copy_sse3.asm @@ -83,6 +83,7 @@ ret %endmacro +SECTION .text ;void vp8_copy32xn_sse3( ; unsigned char *src_ptr, diff --git a/libvpx/vp8/common/x86/dequantize_mmx.asm b/libvpx/vp8/common/x86/dequantize_mmx.asm index 4e551f00a..bfdd99778 100644 --- a/libvpx/vp8/common/x86/dequantize_mmx.asm +++ b/libvpx/vp8/common/x86/dequantize_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q) global sym(vp8_dequantize_b_impl_mmx) PRIVATE diff --git a/libvpx/vp8/common/x86/idctllm_mmx.asm b/libvpx/vp8/common/x86/idctllm_mmx.asm index 96fa2c60d..5773d9d84 100644 --- a/libvpx/vp8/common/x86/idctllm_mmx.asm +++ b/libvpx/vp8/common/x86/idctllm_mmx.asm @@ -31,6 +31,7 @@ ; * ; **************************************************************************/ +SECTION .text ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred, ;int pitch, unsigned char *dest,int stride) diff --git a/libvpx/vp8/common/x86/idctllm_sse2.asm b/libvpx/vp8/common/x86/idctllm_sse2.asm index bf8e2c402..560faba00 100644 --- a/libvpx/vp8/common/x86/idctllm_sse2.asm +++ b/libvpx/vp8/common/x86/idctllm_sse2.asm @@ -19,6 +19,8 @@ ; int dst_stride - 3 ; ) +SECTION .text + global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE sym(vp8_idct_dequant_0_2x_sse2): push rbp diff --git a/libvpx/vp8/common/x86/iwalsh_sse2.asm b/libvpx/vp8/common/x86/iwalsh_sse2.asm index 06e86a80b..82d7bf91a 100644 --- a/libvpx/vp8/common/x86/iwalsh_sse2.asm +++ b/libvpx/vp8/common/x86/iwalsh_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_short_inv_walsh4x4_sse2(short *input, short *output) global sym(vp8_short_inv_walsh4x4_sse2) PRIVATE sym(vp8_short_inv_walsh4x4_sse2): diff --git a/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm b/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm index 6d5aaa19d..6a3d05290 100644 --- a/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm +++ b/libvpx/vp8/common/x86/loopfilter_block_sse2_x86_64.asm @@ -125,6 +125,8 @@ pxor %1, [GLOBAL(t80)] %endmacro +SECTION .text + ;void vp8_loop_filter_bh_y_sse2 ;( ; unsigned char *src_ptr, diff --git a/libvpx/vp8/common/x86/loopfilter_sse2.asm b/libvpx/vp8/common/x86/loopfilter_sse2.asm index 1913abc69..2ae028fea 100644 --- a/libvpx/vp8/common/x86/loopfilter_sse2.asm +++ b/libvpx/vp8/common/x86/loopfilter_sse2.asm @@ -276,6 +276,8 @@ %endmacro +SECTION .text + %if ABI_IS_32BIT ;void vp8_loop_filter_horizontal_edge_sse2 diff --git a/libvpx/vp8/common/x86/mfqe_sse2.asm b/libvpx/vp8/common/x86/mfqe_sse2.asm index 8177b7922..3fde973ad 100644 --- a/libvpx/vp8/common/x86/mfqe_sse2.asm +++ b/libvpx/vp8/common/x86/mfqe_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_filter_by_weight16x16_sse2 ;( ; unsigned char *src, diff 
--git a/libvpx/vp8/common/x86/recon_mmx.asm b/libvpx/vp8/common/x86/recon_mmx.asm index 43f2dc6c6..e6a48f6b0 100644 --- a/libvpx/vp8/common/x86/recon_mmx.asm +++ b/libvpx/vp8/common/x86/recon_mmx.asm @@ -11,6 +11,7 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text ;void copy_mem8x8_mmx( ; unsigned char *src, diff --git a/libvpx/vp8/common/x86/recon_sse2.asm b/libvpx/vp8/common/x86/recon_sse2.asm index cb89537f7..57f8899c7 100644 --- a/libvpx/vp8/common/x86/recon_sse2.asm +++ b/libvpx/vp8/common/x86/recon_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void copy_mem16x16_sse2( ; unsigned char *src, ; int src_stride, diff --git a/libvpx/vp8/common/x86/subpixel_mmx.asm b/libvpx/vp8/common/x86/subpixel_mmx.asm index 6ab7f1fdc..1f3a2baca 100644 --- a/libvpx/vp8/common/x86/subpixel_mmx.asm +++ b/libvpx/vp8/common/x86/subpixel_mmx.asm @@ -17,6 +17,7 @@ extern sym(vp8_bilinear_filters_x86_8) %define vp8_filter_weight 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;void vp8_filter_block1d_h6_mmx ;( diff --git a/libvpx/vp8/common/x86/subpixel_sse2.asm b/libvpx/vp8/common/x86/subpixel_sse2.asm index ca00583ca..6e70f6d2e 100644 --- a/libvpx/vp8/common/x86/subpixel_sse2.asm +++ b/libvpx/vp8/common/x86/subpixel_sse2.asm @@ -16,6 +16,7 @@ extern sym(vp8_bilinear_filters_x86_8) %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The diff --git a/libvpx/vp8/common/x86/subpixel_ssse3.asm b/libvpx/vp8/common/x86/subpixel_ssse3.asm index 1f6cbd1d1..8d55c9320 100644 --- a/libvpx/vp8/common/x86/subpixel_ssse3.asm +++ b/libvpx/vp8/common/x86/subpixel_ssse3.asm @@ -15,6 +15,7 @@ %define VP8_FILTER_WEIGHT 128 %define VP8_FILTER_SHIFT 7 +SECTION .text ;/************************************************************************************ ; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The diff --git a/libvpx/vp8/decoder/decodeframe.c b/libvpx/vp8/decoder/decodeframe.c index 0aec2a01b..077bd3da2 100644 --- a/libvpx/vp8/decoder/decodeframe.c +++ b/libvpx/vp8/decoder/decodeframe.c @@ -930,7 +930,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { /* When error concealment is enabled we should only check the sync * code if we have enough bits available */ - if (!pbi->ec_active || data + 3 < data_end) { + if (data + 3 < data_end) { if (clear[0] != 0x9d || clear[1] != 0x01 || clear[2] != 0x2a) { vpx_internal_error(&pc->error, VPX_CODEC_UNSUP_BITSTREAM, "Invalid frame sync code"); @@ -941,13 +941,19 @@ int vp8_decode_frame(VP8D_COMP *pbi) { * if we have enough data. Otherwise we will end up with the wrong * size. */ - if (!pbi->ec_active || data + 6 < data_end) { + if (data + 6 < data_end) { pc->Width = (clear[3] | (clear[4] << 8)) & 0x3fff; pc->horiz_scale = clear[4] >> 6; pc->Height = (clear[5] | (clear[6] << 8)) & 0x3fff; pc->vert_scale = clear[6] >> 6; + data += 7; + } else if (!pbi->ec_active) { + vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, + "Truncated key frame header"); + } else { + /* Error concealment is active, clear the frame. 
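The decodeframe.c hunk above tightens key-frame header parsing. The old condition `!pbi->ec_active || data + 6 < data_end` skipped the bounds check entirely whenever error concealment was off, and `data += 7` then ran unconditionally, so a truncated packet could both read the sync/size fields past data_end and advance the read pointer beyond the buffer. The new flow consumes the 7 header bytes only when they are actually present, raises VPX_CODEC_CORRUPT_FRAME for a truncated header when concealment is inactive, and otherwise clamps data to data_end so the concealment path takes over cleanly.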
*/ + data = data_end; } - data += 7; } else { memcpy(&xd->pre, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); memcpy(&xd->dst, yv12_fb_new, sizeof(YV12_BUFFER_CONFIG)); @@ -1199,7 +1205,8 @@ int vp8_decode_frame(VP8D_COMP *pbi) { pbi->frame_corrupt_residual = 0; #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd && pc->multi_token_partition != ONE_PARTITION) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) && + pc->multi_token_partition != ONE_PARTITION) { unsigned int thread; vp8mt_decode_mb_rows(pbi, xd); vp8_yv12_extend_frame_borders(yv12_fb_new); diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c index b946ab73d..8e9600c6d 100644 --- a/libvpx/vp8/decoder/decodemv.c +++ b/libvpx/vp8/decoder/decodemv.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "decodemv.h" #include "treereader.h" #include "vp8/common/entropymv.h" #include "vp8/common/entropymode.h" @@ -64,8 +65,7 @@ static int read_mvcomponent(vp8_reader *r, const MV_CONTEXT *mvc) { const vp8_prob *const p = (const vp8_prob *)mvc; int x = 0; - if (vp8_read(r, p[mvpis_short])) /* Large */ - { + if (vp8_read(r, p[mvpis_short])) { /* Large */ int i = 0; do { @@ -284,8 +284,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi) { vp8_reader *const bc = &pbi->mbc[8]; mbmi->ref_frame = (MV_REFERENCE_FRAME)vp8_read(bc, pbi->prob_intra); - if (mbmi->ref_frame) /* inter MB */ - { + if (mbmi->ref_frame) { /* inter MB */ enum { CNT_INTRA, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV }; int cnt[4]; int *cntx = cnt; diff --git a/libvpx/vp8/decoder/onyxd_if.c b/libvpx/vp8/decoder/onyxd_if.c index 789c2eeff..f516eb0c7 100644 --- a/libvpx/vp8/decoder/onyxd_if.c +++ b/libvpx/vp8/decoder/onyxd_if.c @@ -41,7 +41,6 @@ #endif extern void vp8_init_loop_filter(VP8_COMMON *cm); -extern void vp8cx_init_de_quantizer(VP8D_COMP *pbi); static int get_free_fb(VP8_COMMON *cm); static void ref_cnt_fb(int *buf, int *idx, int new_idx); diff --git a/libvpx/vp8/decoder/onyxd_int.h b/libvpx/vp8/decoder/onyxd_int.h index 88b1ff16b..5ecacdbb9 100644 --- a/libvpx/vp8/decoder/onyxd_int.h +++ b/libvpx/vp8/decoder/onyxd_int.h @@ -68,7 +68,7 @@ typedef struct VP8D_COMP { #if CONFIG_MULTITHREAD /* variable for threading */ - int b_multithreaded_rd; + vpx_atomic_int b_multithreaded_rd; int max_threads; int current_mb_col_main; unsigned int decoding_thread_count; @@ -76,9 +76,8 @@ typedef struct VP8D_COMP { int mt_baseline_filter_level[MAX_MB_SEGMENTS]; int sync_range; - int *mt_current_mb_col; /* Each row remembers its already decoded column. */ - pthread_mutex_t *pmutex; - pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */ + /* Each row remembers its already decoded column. 
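onyxd_int.h retypes b_multithreaded_rd and the per-row progress array as vpx_atomic_int, which is what allows the pmutex array and mt_mutex to be deleted outright in threading.c further down. The API surface these hunks rely on is small; a sketch of how it is used, assuming vpx_util/vpx_atomics.h is on the include path and provides exactly the names seen in this diff:

    #include "vpx_util/vpx_atomics.h"

    static vpx_atomic_int running = VPX_ATOMIC_INIT(0); /* static initializer */

    void start(void) {
      vpx_atomic_init(&running, 1); /* plain init, before any thread exists */
    }

    int keep_going(void) {
      return vpx_atomic_load_acquire(&running); /* reader side */
    }

    void stop(void) {
      vpx_atomic_store_release(&running, 0); /* writer side; readers see 0 */
    }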
*/ + vpx_atomic_int *mt_current_mb_col; unsigned char **mt_yabove_row; /* mb_rows x width */ unsigned char **mt_uabove_row; @@ -119,6 +118,8 @@ typedef struct VP8D_COMP { void *decrypt_state; } VP8D_COMP; +void vp8cx_init_de_quantizer(VP8D_COMP *pbi); +void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); int vp8_decode_frame(VP8D_COMP *cpi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); diff --git a/libvpx/vp8/decoder/threading.c b/libvpx/vp8/decoder/threading.c index 9f7751988..d0213f75c 100644 --- a/libvpx/vp8/decoder/threading.c +++ b/libvpx/vp8/decoder/threading.c @@ -20,6 +20,7 @@ #include "vp8/common/loopfilter.h" #include "vp8/common/extend.h" #include "vpx_ports/vpx_timer.h" +#include "decoderthreading.h" #include "detokenize.h" #include "vp8/common/reconintra4x4.h" #include "vp8/common/reconinter.h" @@ -36,8 +37,6 @@ memset((p), 0, (n) * sizeof(*(p))); \ } while (0) -void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd); - static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count) { VP8_COMMON *const pc = &pbi->common; @@ -80,7 +79,8 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8; } - for (i = 0; i < pc->mb_rows; ++i) pbi->mt_current_mb_col[i] = -1; + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_store_release(&pbi->mt_current_mb_col[i], -1); } static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -248,12 +248,13 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row) { - const int *last_row_current_mb_col; - int *current_mb_col; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col; int mb_row; VP8_COMMON *pc = &pbi->common; const int nsync = pbi->sync_range; - const int first_row_no_sync_above = pc->mb_cols + nsync; + const vpx_atomic_int first_row_no_sync_above = + VPX_ATOMIC_INIT(pc->mb_cols + nsync); int num_part = 1 << pbi->common.multi_token_partition; int last_mb_row = start_mb_row; @@ -357,13 +358,11 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &pbi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } /* Distance of MB to the various image edges. 
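Taken together, mt_decode_mb_rows() forms a wavefront: each row publishes its progress every nsync columns via vpx_atomic_store_release(current_mb_col, mb_col - 1) and stalls in vp8_atomic_spin_wait() until the row above is at least nsync columns ahead. For example, with nsync == 4, once the row above has published column 8 the row below may work through column 4 before it has to wait again. The first_row_no_sync_above sentinel (mb_cols + nsync) is wired in as the parent of row 0 so the top row never blocks, and publishing mb_col + nsync at the end of a row releases the row below for good.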
@@ -549,7 +548,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, } /* last MB of row is ready just after extension is done */ - protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync); + vpx_atomic_store_release(current_mb_col, mb_col + nsync); ++xd->mode_info_context; /* skip prediction column */ xd->up_available = 1; @@ -569,10 +568,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break; + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) break; if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd) == 0) { break; } else { MACROBLOCKD *xd = &mbrd->mbd; @@ -590,9 +589,8 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { int core_count = 0; unsigned int ithread; - pbi->b_multithreaded_rd = 0; + vpx_atomic_init(&pbi->b_multithreaded_rd, 0); pbi->allocated_decoding_thread_count = 0; - pthread_mutex_init(&pbi->mt_mutex, NULL); /* limit decoding threads to the max number of token partitions */ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads; @@ -603,7 +601,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { } if (core_count > 1) { - pbi->b_multithreaded_rd = 1; + vpx_atomic_init(&pbi->b_multithreaded_rd, 1); pbi->decoding_thread_count = core_count - 1; CALLOC_ARRAY(pbi->h_decoding_thread, pbi->decoding_thread_count); @@ -649,16 +647,6 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) { int i; - /* De-allocate mutex */ - if (pbi->pmutex != NULL) { - for (i = 0; i < mb_rows; ++i) { - pthread_mutex_destroy(&pbi->pmutex[i]); - } - - vpx_free(pbi->pmutex); - pbi->pmutex = NULL; - } - vpx_free(pbi->mt_current_mb_col); pbi->mt_current_mb_col = NULL; @@ -724,7 +712,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { int i; int uv_width; - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); /* our internal buffers are always multiples of 16 */ @@ -742,36 +730,33 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; - /* Allocate mutex */ - CHECK_MEM_ERROR(pbi->pmutex, - vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows)); - if (pbi->pmutex) { - for (i = 0; i < pc->mb_rows; ++i) { - pthread_mutex_init(&pbi->pmutex[i], NULL); - } - } - - /* Allocate an int for each mb row. */ - CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows); + /* Allocate a vpx_atomic_int for each mb row. */ + CHECK_MEM_ERROR(pbi->mt_current_mb_col, + vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); + for (i = 0; i < pc->mb_rows; ++i) + vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); /* Allocate memory for above_row buffers. 
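One detail in vp8mt_alloc_temp_buffers(): mt_current_mb_col moves from CALLOC_ARRAY to vpx_malloc followed by an explicit vpx_atomic_init() loop, presumably because zero-filled memory is not a portable way to initialize an atomic object; every element now goes through the type's own initializer. The surrounding CHECK_MEM_ERROR reflows are formatting churn only.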
*/ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (width + (VP8BORDERINPIXELS << 1)))); + CHECK_MEM_ERROR( + pbi->mt_yabove_row[i], + vpx_memalign( + 16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR( + pbi->mt_uabove_row[i], + vpx_memalign(16, + sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], - vpx_memalign(16, sizeof(unsigned char) * - (uv_width + VP8BORDERINPIXELS))); + CHECK_MEM_ERROR( + pbi->mt_vabove_row[i], + vpx_memalign(16, + sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); /* Allocate memory for left_col buffers. */ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); @@ -793,9 +778,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* shutdown MB Decoding thread; */ - if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { int i; - protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0); + vpx_atomic_store_release(&pbi->b_multithreaded_rd, 0); /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { @@ -825,7 +810,6 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows); } - pthread_mutex_destroy(&pbi->mt_mutex); } void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c index 7086faae9..8cacb6450 100644 --- a/libvpx/vp8/encoder/bitstream.c +++ b/libvpx/vp8/encoder/bitstream.c @@ -500,8 +500,7 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) { } write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob); - } else /* inter coded */ - { + } else { /* inter coded */ int_mv best_mv; vp8_prob mv_ref_p[VP8_MVREFS - 1]; @@ -1416,7 +1415,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { pack_mb_row_tokens(cpi, &cpi->bc[1]); } else { vp8_pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count); diff --git a/libvpx/vp8/encoder/bitstream.h b/libvpx/vp8/encoder/bitstream.h index 2b196dcd2..ed45bff9e 100644 --- a/libvpx/vp8/encoder/bitstream.h +++ b/libvpx/vp8/encoder/bitstream.h @@ -15,7 +15,15 @@ extern "C" { #endif +#include "vp8/encoder/treewriter.h" +#include "vp8/encoder/tokenize.h" + void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount); +void vp8_convert_rfct_to_prob(struct VP8_COMP *const cpi); +void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, + int prob_last, int prob_garf); +int vp8_estimate_entropy_savings(struct VP8_COMP *cpi); +void vp8_update_coef_probs(struct VP8_COMP *cpi); #ifdef __cplusplus } // extern "C" diff --git a/libvpx/vp8/encoder/encodeframe.c b/libvpx/vp8/encoder/encodeframe.c index c7ad3bfe2..9bb0df72d 100644 --- a/libvpx/vp8/encoder/encodeframe.c +++ b/libvpx/vp8/encoder/encodeframe.c @@ -11,8 +11,12 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include 
"./vpx_dsp_rtcd.h" +#include "bitstream.h" #include "encodemb.h" #include "encodemv.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif #include "vp8/common/common.h" #include "onyx_int.h" #include "vp8/common/extend.h" @@ -35,13 +39,6 @@ #include "encodeframe.h" extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t); -extern void vp8_calc_ref_frame_costs(int *ref_frame_cost, int prob_intra, - int prob_last, int prob_garf); -extern void vp8_convert_rfct_to_prob(VP8_COMP *const cpi); -extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); -extern void vp8_auto_select_speed(VP8_COMP *cpi); -extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, - MB_ROW_COMP *mbr_ei, int count); static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x); #ifdef MODE_STATS @@ -344,11 +341,11 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD const int nsync = cpi->mt_sync_range; - const int rightmost_col = cm->mb_cols + nsync; - const int *last_row_current_mb_col; - int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + vpx_atomic_int rightmost_col = VPX_ATOMIC_INIT(cm->mb_cols + nsync); + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; - if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0 && mb_row != 0) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; } else { last_row_current_mb_col = &rightmost_col; @@ -418,15 +415,13 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } } #endif @@ -566,8 +561,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) { - protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col); + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) != 0) { + vpx_atomic_store_release(current_mb_col, + vpx_atomic_load_acquire(&rightmost_col)); } #endif @@ -752,13 +748,14 @@ void vp8_encode_frame(VP8_COMP *cpi) { vpx_usec_timer_start(&emr_timer); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { int i; vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); - for (i = 0; i < cm->mb_rows; ++i) cpi->mt_current_mb_col[i] = -1; + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); for (i = 0; i < cpi->encoding_thread_count; ++i) { sem_post(&cpi->h_event_start_encoding[i]); diff --git a/libvpx/vp8/encoder/encodeframe.h b/libvpx/vp8/encoder/encodeframe.h index c1d863492..5274aba41 100644 --- a/libvpx/vp8/encoder/encodeframe.h +++ b/libvpx/vp8/encoder/encodeframe.h @@ -10,24 +10,29 @@ #ifndef VP8_ENCODER_ENCODEFRAME_H_ #define VP8_ENCODER_ENCODEFRAME_H_ +#include "vp8/encoder/tokenize.h" + #ifdef 
__cplusplus extern "C" { #endif -extern void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x); -extern void vp8_build_block_offsets(MACROBLOCK *x); +struct VP8_COMP; +struct macroblock; + +void vp8_activity_masking(struct VP8_COMP *cpi, MACROBLOCK *x); + +void vp8_build_block_offsets(struct macroblock *x); -extern void vp8_setup_block_ptrs(MACROBLOCK *x); +void vp8_setup_block_ptrs(struct macroblock *x); -extern void vp8_encode_frame(VP8_COMP *cpi); +void vp8_encode_frame(struct VP8_COMP *cpi); -extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t, int recon_yoffset, - int recon_uvoffset, int mb_row, - int mb_col); +int vp8cx_encode_inter_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t, int recon_yoffset, + int recon_uvoffset, int mb_row, int mb_col); -extern int vp8cx_encode_intra_macroblock(VP8_COMP *cpi, MACROBLOCK *x, - TOKENEXTRA **t); +int vp8cx_encode_intra_macroblock(struct VP8_COMP *cpi, struct macroblock *x, + TOKENEXTRA **t); #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp8/encoder/encodemv.c b/libvpx/vp8/encoder/encodemv.c index 36e9a9078..ea93ccd71 100644 --- a/libvpx/vp8/encoder/encodemv.c +++ b/libvpx/vp8/encoder/encodemv.c @@ -25,14 +25,12 @@ static void encode_mvcomponent(vp8_writer *const w, const int v, const vp8_prob *p = mvc->prob; const int x = v < 0 ? -v : v; - if (x < mvnum_short) /* Small */ - { + if (x < mvnum_short) { /* Small */ vp8_write(w, 0, p[mvpis_short]); vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3); if (!x) return; /* no sign bit */ - } else /* Large */ - { + } else { /* Large */ int i = 0; vp8_write(w, 1, p[mvpis_short]); diff --git a/libvpx/vp8/encoder/ethreading.c b/libvpx/vp8/encoder/ethreading.c index df34997ac..55a1528b1 100644 --- a/libvpx/vp8/encoder/ethreading.c +++ b/libvpx/vp8/encoder/ethreading.c @@ -14,6 +14,7 @@ #include "vp8/common/extend.h" #include "bitstream.h" #include "encodeframe.h" +#include "ethreading.h" #if CONFIG_MULTITHREAD @@ -25,11 +26,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) { VP8_COMMON *cm = &cpi->common; while (1) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); @@ -47,7 +48,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; @@ -65,7 +66,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int *totalrate = &mbri->totalrate; /* we're shutting down */ - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) == 0) break; xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1); xd->mode_info_stride = cm->mode_info_stride; @@ -79,8 +80,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; int map_index = (mb_row * cm->mb_cols); - const int *last_row_current_mb_col; - 
int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const vpx_atomic_int *last_row_current_mb_col; + vpx_atomic_int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)]; @@ -107,13 +108,11 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { /* for each macroblock col in image */ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { if (((mb_col - 1) % nsync) == 0) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; - protected_write(mutex, current_mb_col, mb_col - 1); + vpx_atomic_store_release(current_mb_col, mb_col - 1); } if (mb_row && !(mb_col & (nsync - 1))) { - pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; - sync_read(mutex, mb_col, last_row_current_mb_col, nsync); + vp8_atomic_spin_wait(mb_col, last_row_current_mb_col, nsync); } #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -285,7 +284,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync); + vpx_atomic_store_release(current_mb_col, mb_col + nsync); /* this is to account for the border */ xd->mode_info_context++; @@ -489,12 +488,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - cpi->b_multi_threaded = 0; + vpx_atomic_init(&cpi->b_multi_threaded, 0); cpi->encoding_thread_count = 0; cpi->b_lpf_running = 0; - pthread_mutex_init(&cpi->mt_mutex, NULL); - if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { int ithread; int th_count = cpi->oxcf.multi_threaded - 1; @@ -525,7 +522,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); - cpi->b_multi_threaded = 1; + vpx_atomic_store_release(&cpi->b_multi_threaded, 1); cpi->encoding_thread_count = th_count; /* @@ -554,7 +551,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); @@ -568,8 +565,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); - pthread_mutex_destroy(&cpi->mt_mutex); - return -1; } @@ -584,7 +579,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { sem_post(&cpi->h_event_start_encoding[ithread]); sem_post(&cpi->h_event_end_encoding[ithread]); @@ -602,8 +597,6 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); - pthread_mutex_destroy(&cpi->mt_mutex); - return -2; } } @@ -612,9 +605,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { } void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { - if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* shutdown other threads */ - protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); + 
vpx_atomic_store_release(&cpi->b_multi_threaded, 0); { int i; @@ -642,6 +635,5 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); } - pthread_mutex_destroy(&cpi->mt_mutex); } #endif diff --git a/libvpx/vp8/encoder/ethreading.h b/libvpx/vp8/encoder/ethreading.h new file mode 100644 index 000000000..95bf73d18 --- /dev/null +++ b/libvpx/vp8/encoder/ethreading.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_ETHREADING_H_ +#define VP8_ENCODER_ETHREADING_H_ + +#include "vp8/encoder/onyx_int.h" + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct macroblock; + +void vp8cx_init_mbrthread_data(struct VP8_COMP *cpi, struct macroblock *x, + MB_ROW_COMP *mbr_ei, int count); +int vp8cx_create_encoder_threads(struct VP8_COMP *cpi); +void vp8cx_remove_encoder_threads(struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_ETHREADING_H_ diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c index caf19059e..70f924341 100644 --- a/libvpx/vp8/encoder/firstpass.c +++ b/libvpx/vp8/encoder/firstpass.c @@ -1273,8 +1273,9 @@ void vp8_init_second_pass(VP8_COMP *cpi) { * sum duration is not. Its calculated based on the actual durations of * all frames from the first pass. */ - vp8_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp8_new_framerate(cpi, + 10000000.0 * cpi->twopass.total_stats.count / + cpi->twopass.total_stats.duration); cpi->output_framerate = cpi->framerate; cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c index b4a49a3b1..970120f3b 100644 --- a/libvpx/vp8/encoder/mcomp.c +++ b/libvpx/vp8/encoder/mcomp.c @@ -34,22 +34,19 @@ int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight) { * NEAREST for subsequent blocks. The "Weight" parameter allows, to a * limited extent, for some account to be taken of these factors. 
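The mcomp.c hunk below drops the clamp()-based index computation and restores direct table indexing, i.e. cost = ((mvcost[0][(row diff) >> 1] + mvcost[1][(col diff) >> 1]) * Weight) >> 7. Weight is a Q7 scale (128 meaning 1.0), so the >> 7 normalizes it back out; mv_err_cost applies error_per_bit with a +128 rounder and >> 8 in the same spirit. The clamped variant being removed looks like a downstream bounds-safety patch that this update reverts in favor of the upstream code, which relies on callers keeping the difference within the cost-table range.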
*/ - const int mv_idx_row = - clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); - const int mv_idx_col = - clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); - return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * Weight) >> 7; + return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + + mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * + Weight) >> + 7; } static int mv_err_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int error_per_bit) { /* Ignore mv costing if mvcost is NULL */ if (mvcost) { - const int mv_idx_row = - clamp((mv->as_mv.row - ref->as_mv.row) >> 1, 0, MVvals); - const int mv_idx_col = - clamp((mv->as_mv.col - ref->as_mv.col) >> 1, 0, MVvals); - return ((mvcost[0][mv_idx_row] + mvcost[1][mv_idx_col]) * error_per_bit + + return ((mvcost[0][(mv->as_mv.row - ref->as_mv.row) >> 1] + + mvcost[1][(mv->as_mv.col - ref->as_mv.col) >> 1]) * + error_per_bit + 128) >> 8; } diff --git a/libvpx/vp8/encoder/mips/mmi/dct_mmi.c b/libvpx/vp8/encoder/mips/mmi/dct_mmi.c new file mode 100644 index 000000000..1f60a692d --- /dev/null +++ b/libvpx/vp8/encoder/mips/mmi/dct_mmi.c @@ -0,0 +1,425 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vp8_rtcd.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +/* clang-format off */ +/* TRANSPOSE_4H: transpose 4x4 matrix. + Input: ftmp1,ftmp2,ftmp3,ftmp4 + Output: ftmp1,ftmp2,ftmp3,ftmp4 + Note: ftmp0 always be 0, ftmp5~9 used for temporary value. 
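The constants staged in the MMI transform below map directly onto the scalar vp8_short_fdct4x4: the pmullh by 8 is the << 3 input scaling, 5352/2217 with the 14500/7500 rounders and a 12-bit shift are the first pass, and 12000/51000 with a 16-bit shift (plus the +7, >> 4 pair) are the second pass. For orientation, a sketch of the first-pass row butterfly as it looks in plain C (close to, though not copied verbatim from, vp8/encoder/dct.c):

    static void fdct4_row(const short *ip, short *op) {
      const int a1 = (ip[0] + ip[3]) * 8;
      const int b1 = (ip[1] + ip[2]) * 8;
      const int c1 = (ip[1] - ip[2]) * 8;
      const int d1 = (ip[0] - ip[3]) * 8;
      op[0] = (short)(a1 + b1);
      op[2] = (short)(a1 - b1);
      op[1] = (short)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
      op[3] = (short)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
    }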
+ */ +#define TRANSPOSE_4H \ + MMI_LI(%[tmp0], 0x93) \ + "mtc1 %[tmp0], %[ftmp10] \n\t" \ + "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \ + "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \ + "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \ + "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \ + "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \ + "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t" +/* clang-format on */ + +void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) { + uint64_t tmp[1]; + int16_t *ip = input; + +#if _MIPS_SIM == _ABIO32 + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f2"); + register double ftmp2 asm("$f4"); + register double ftmp3 asm("$f6"); + register double ftmp4 asm("$f8"); + register double ftmp5 asm("$f10"); + register double ftmp6 asm("$f12"); + register double ftmp7 asm("$f14"); + register double ftmp8 asm("$f16"); + register double ftmp9 asm("$f18"); + register double ftmp10 asm("$f20"); + register double ftmp11 asm("$f22"); + register double ftmp12 asm("$f24"); +#else + register double ftmp0 asm("$f0"); + register double ftmp1 asm("$f1"); + register double ftmp2 asm("$f2"); + register double ftmp3 asm("$f3"); + register double ftmp4 asm("$f4"); + register double ftmp5 asm("$f5"); + register double ftmp6 asm("$f6"); + register double ftmp7 asm("$f7"); + register double ftmp8 asm("$f8"); + register double ftmp9 asm("$f9"); + register double ftmp10 asm("$f10"); + register double ftmp11 asm("$f11"); + register double ftmp12 asm("$f12"); +#endif // _MIPS_SIM == _ABIO32 + + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_5352) = { 0x000014e8000014e8ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_2217) = { 0x000008a9000008a9ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_8) = { 0x0008000800080008ULL }; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + 
MMI_ADDU(%[ip], %[ip], %[pitch]) + TRANSPOSE_4H + + "ldc1 %[ftmp11], %[ff_ph_8] \n\t" + // f1 + f4 + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + // a1 + "pmullh %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + // f2 + f3 + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + // b1 + "pmullh %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + // f2 - f3 + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + // c1 + "pmullh %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + // f1 - f4 + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + // d1 + "pmullh %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + // op[0] = a1 + b1 + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // op[2] = a1 - b1 + "psubh %[ftmp3], %[ftmp5], %[ftmp6] \n\t" + + // op[1] = (c1 * 2217 + d1 * 5352 + 14500) >> 12 + MMI_LI(%[tmp0], 0x0c) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "ldc1 %[ftmp12], %[ff_pw_14500] \n\t" + "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + + // op[3] = (d1 * 2217 - c1 * 5352 + 7500) >> 12 + "ldc1 %[ftmp12], %[ff_pw_7500] \n\t" + "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + TRANSPOSE_4H + + "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t" + "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t" + + "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t" + "ldc1 %[ftmp9], %[ff_ph_01] \n\t" + "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t" + + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t" + "ldc1 %[ftmp9], %[ff_ph_07] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + MMI_LI(%[tmp0], 0x04) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + + MMI_LI(%[tmp0], 0x10) + "mtc1 %[tmp0], %[ftmp9] \n\t" + "ldc1 %[ftmp12], %[ff_pw_12000] \n\t" + "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t" + "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t" + "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t" + + "ldc1 %[ftmp12], %[ff_pw_51000] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t" + "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t" + "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t" + "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t" + "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t" + "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t" + "gssdrc1 %[ftmp1], 
0x00(%[output]) \n\t" + "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t" + "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t" + "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t" + "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t" + "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t" + + : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2), + [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5), + [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8), + [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11), + [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0]), [ip]"+&r"(ip) + : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), + [ff_ph_op1] "f"(ff_ph_op1), [ff_ph_op3] "f"(ff_ph_op3), + [ff_pw_14500] "m"(ff_pw_14500), [ff_pw_7500] "m"(ff_pw_7500), + [ff_pw_12000] "m"(ff_pw_12000), [ff_pw_51000] "m"(ff_pw_51000), + [ff_pw_5352]"m"(ff_pw_5352), [ff_pw_2217]"m"(ff_pw_2217), + [ff_ph_8]"m"(ff_ph_8), [pitch]"r"(pitch), [output] "r"(output) + : "memory" + ); +} + +void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) { + vp8_short_fdct4x4_mmi(input, output, pitch); + vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch); +} + +void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) { + double ftmp[13]; + uint32_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL }; + DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL }; + + __asm__ volatile ( + MMI_LI(%[tmp0], 0x02) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t" + MMI_ADDU(%[ip], %[ip], %[pitch]) + "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t" + TRANSPOSE_4H + + "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + // a + "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t" + // d + "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t" + // c + "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t" + // b + "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t" + + // a + d + "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t" + // b + c + "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t" + // b - c + "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t" + // a - d + "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t" + + "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t" + TRANSPOSE_4H + + // op[2], op[0] + "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t" + // op[3], op[1] + "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t" + + // op[6], op[4] + "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t" + // op[7], op[5] + "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t" + + // op[10], op[8] + "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t" + // op[11], op[9] + "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t" + + // op[14], op[12] + "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t" + // op[15], op[13] + "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t" + + // a1, a3 + "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t" + // 
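A gloss for one idiom in the Walsh transform below: each pcmpgtw / and-with-ff_pw_01 / paddw / add-ff_pw_03 / psraw-by-3 group is the vector form of the scalar rounding step `x += (x < 0); out = (x + 3) >> 3;`. pcmpgtw against zero yields an all-ones mask in the negative lanes, the AND turns that mask into +1, and the shift then rounds symmetrically around zero, matching vp8_short_walsh4x4_c.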
d1, d3 + "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t" + // c1, c3 + "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t" + // b1, b3 + "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t" + + // a1 + d1, a3 + d3 + "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t" + // b1 + c1, b3 + c3 + "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t" + // b1 - c1, b3 - c3 + "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t" + // a1 - d1, a3 - d3 + "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t" + + // a2, a4 + "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t" + // d2, d4 + "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t" + // c2, c4 + "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t" + // b2, b4 + "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t" + + // a2 + d2, a4 + d4 + "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t" + // b2 + c2, b4 + c4 + "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t" + // b2 - c2, b4 - c4 + "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t" + // a2 - d2, a4 - d4 + "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t" + + MMI_LI(%[tmp0], 0x03) + "mtc1 %[tmp0], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t" + "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t" + "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t" + "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t" + "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t" + "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t" + "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t" + "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t" + "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t" + "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t" + "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t" + "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t" + "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t" + + "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t" + "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t" + "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t" + "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t" + + "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t" + "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t" + "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t" + + MMI_LI(%[tmp0], 0x72) + "mtc1 %[tmp0], %[ftmp11] \n\t" + "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t" + "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t" + "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t" + "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t" + + "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t" + "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t" + "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t" + "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t" + "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t" + "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t" + "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t" + "gssdrc1 
%[ftmp4], 0x18(%[op]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), + [tmp0]"=&r"(tmp[0]), + [ip]"+&r"(input) + : [op]"r"(output), + [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch), + [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask), + [ff_ph_01]"f"(ff_ph_01) + : "memory" + ); +} diff --git a/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c b/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c new file mode 100644 index 000000000..3ccb196ff --- /dev/null +++ b/libvpx/vp8/encoder/mips/mmi/vp8_quantize_mmi.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_mem/vpx_mem.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vp8/encoder/onyx_int.h" +#include "vp8/encoder/quantize.h" +#include "vp8/common/quant_common.h" + +#define REGULAR_SELECT_EOB(i, rc) \ + z = coeff_ptr[rc]; \ + sz = (z >> 31); \ + x = (z ^ sz) - sz; \ + zbin = zbin_ptr[rc] + *(zbin_boost_ptr++) + zbin_oq_value; \ + if (x >= zbin) { \ + x += round_ptr[rc]; \ + y = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16; \ + if (y) { \ + x = (y ^ sz) - sz; \ + qcoeff_ptr[rc] = x; \ + dqcoeff_ptr[rc] = x * dequant_ptr[rc]; \ + eob = i; \ + zbin_boost_ptr = b->zrun_zbin_boost; \ + } \ + } + +void vp8_fast_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + const int16_t *coeff_ptr = b->coeff; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant_fast; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t *inv_zig_zag = vp8_default_inv_zig_zag; + + double ftmp[13]; + uint64_t tmp[1]; + DECLARE_ALIGNED(8, const uint64_t, ones) = { 0xffffffffffffffffULL }; + int eob = 0; + + __asm__ volatile( + // loop 0 ~ 7 + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[ftmp1], 0x07(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[coeff_ptr]) \n\t" + "li %[tmp0], 0x0f \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x07(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + 
"psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x08(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x07(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp5], %[ftmp5], %[ones] \n\t" + "xor %[ftmp6], %[ftmp6], %[ones] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp10], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x07(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x0f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x08(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp6], 0x08(%[dqcoeff_ptr]) \n\t" + + // loop 8 ~ 15 + "gsldlc1 %[ftmp1], 0x17(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[coeff_ptr]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[coeff_ptr]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[coeff_ptr]) \n\t" + + "psrah %[ftmp3], %[ftmp1], %[ftmp9] \n\t" + "xor %[ftmp1], %[ftmp3], %[ftmp1] \n\t" + "psubh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp4], %[ftmp2], %[ftmp9] \n\t" + "xor %[ftmp2], %[ftmp4], %[ftmp2] \n\t" + "psubh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[round_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[round_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[round_ptr]) \n\t" + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "paddh %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "gsldlc1 %[ftmp7], 0x17(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp7], 0x10(%[quant_ptr]) \n\t" + "gsldlc1 %[ftmp8], 0x1f(%[quant_ptr]) \n\t" + "gsldrc1 %[ftmp8], 0x18(%[quant_ptr]) \n\t" + "pmulhuh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmulhuh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + + "xor %[ftmp7], %[ftmp5], %[ftmp3] \n\t" + "xor %[ftmp8], %[ftmp6], %[ftmp4] \n\t" + "psubh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" + "psubh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + "gssdlc1 %[ftmp7], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp7], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp8], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp8], 0x18(%[qcoeff_ptr]) \n\t" + + "gsldlc1 %[ftmp1], 0x17(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[inv_zig_zag]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[inv_zig_zag]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[inv_zig_zag]) \n\t" + "pcmpeqh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "pcmpeqh %[ftmp6], %[ftmp6], %[ftmp0] \n\t" + "xor %[ftmp5], %[ftmp5], %[ones] \n\t" + "xor %[ftmp6], %[ftmp6], %[ones] \n\t" + "and %[ftmp5], %[ftmp5], %[ftmp1] \n\t" + "and %[ftmp6], %[ftmp6], %[ftmp2] \n\t" + "pmaxsh %[ftmp11], %[ftmp5], %[ftmp6] \n\t" + + "gsldlc1 %[ftmp5], 0x17(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp5], 0x10(%[dequant_ptr]) \n\t" + "gsldlc1 %[ftmp6], 0x1f(%[dequant_ptr]) \n\t" + "gsldrc1 %[ftmp6], 0x18(%[dequant_ptr]) \n\t" + "pmullh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" + "pmullh %[ftmp6], %[ftmp6], %[ftmp8] \n\t" + "gssdlc1 %[ftmp5], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp5], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp6], 0x1f(%[dqcoeff_ptr]) \n\t" + 
"gssdrc1 %[ftmp6], 0x18(%[dqcoeff_ptr]) \n\t" + + "li %[tmp0], 0x10 \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "psrlw %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "li %[tmp0], 0xaa \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "pshufh %[ftmp11], %[ftmp10], %[ftmp9] \n\t" + "pmaxsh %[ftmp10], %[ftmp10], %[ftmp11] \n\t" + "li %[tmp0], 0xffff \n\t" + "mtc1 %[tmp0], %[ftmp9] \n\t" + "and %[ftmp10], %[ftmp10], %[ftmp9] \n\t" + "gssdlc1 %[ftmp10], 0x07(%[eob]) \n\t" + "gssdrc1 %[ftmp10], 0x00(%[eob]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]) + : [coeff_ptr] "r"((mips_reg)coeff_ptr), + [qcoeff_ptr] "r"((mips_reg)qcoeff_ptr), + [dequant_ptr] "r"((mips_reg)dequant_ptr), + [round_ptr] "r"((mips_reg)round_ptr), + [quant_ptr] "r"((mips_reg)quant_ptr), + [dqcoeff_ptr] "r"((mips_reg)dqcoeff_ptr), + [inv_zig_zag] "r"((mips_reg)inv_zig_zag), [eob] "r"((mips_reg)&eob), + [ones] "f"(ones) + : "memory"); + + *d->eob = eob; +} + +void vp8_regular_quantize_b_mmi(BLOCK *b, BLOCKD *d) { + int eob = 0; + int x, y, z, sz, zbin; + const int16_t *zbin_boost_ptr = b->zrun_zbin_boost; + const int16_t *coeff_ptr = b->coeff; + const int16_t *zbin_ptr = b->zbin; + const int16_t *round_ptr = b->round; + const int16_t *quant_ptr = b->quant; + const int16_t *quant_shift_ptr = b->quant_shift; + int16_t *qcoeff_ptr = d->qcoeff; + int16_t *dqcoeff_ptr = d->dqcoeff; + const int16_t *dequant_ptr = d->dequant; + const int16_t zbin_oq_value = b->zbin_extra; + register double ftmp0 asm("$f0"); + + // memset(qcoeff_ptr, 0, 32); + // memset(dqcoeff_ptr, 0, 32); + /* clang-format off */ + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gssdlc1 %[ftmp0], 0x07(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[qcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[qcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[qcoeff_ptr]) \n\t" + + "gssdlc1 %[ftmp0], 0x07(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x00(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x0f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x08(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x17(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x10(%[dqcoeff_ptr]) \n\t" + "gssdlc1 %[ftmp0], 0x1f(%[dqcoeff_ptr]) \n\t" + "gssdrc1 %[ftmp0], 0x18(%[dqcoeff_ptr]) \n\t" + : [ftmp0]"=&f"(ftmp0) + : [qcoeff_ptr]"r"(qcoeff_ptr), [dqcoeff_ptr]"r"(dqcoeff_ptr) + : "memory" + ); + /* clang-format on */ + + REGULAR_SELECT_EOB(1, 0); + REGULAR_SELECT_EOB(2, 1); + REGULAR_SELECT_EOB(3, 4); + REGULAR_SELECT_EOB(4, 8); + REGULAR_SELECT_EOB(5, 5); + REGULAR_SELECT_EOB(6, 2); + REGULAR_SELECT_EOB(7, 3); + REGULAR_SELECT_EOB(8, 6); + REGULAR_SELECT_EOB(9, 9); + REGULAR_SELECT_EOB(10, 12); + REGULAR_SELECT_EOB(11, 13); + REGULAR_SELECT_EOB(12, 10); + REGULAR_SELECT_EOB(13, 7); + REGULAR_SELECT_EOB(14, 11); + REGULAR_SELECT_EOB(15, 14); + REGULAR_SELECT_EOB(16, 15); + + *d->eob = (char)eob; +} diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c index b571d29d9..224318242 100644 --- a/libvpx/vp8/encoder/onyx_if.c 
+++ b/libvpx/vp8/encoder/onyx_if.c @@ -12,10 +12,12 @@ #include "./vpx_scale_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vp8_rtcd.h" +#include "bitstream.h" #include "vp8/common/onyxc_int.h" #include "vp8/common/blockd.h" #include "onyx_int.h" #include "vp8/common/systemdependent.h" +#include "vp8/common/vp8_skin_detection.h" #include "vp8/encoder/quantize.h" #include "vp8/common/alloccommon.h" #include "mcomp.h" @@ -35,6 +37,7 @@ #include "vp8/common/threading.h" #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_timer.h" +#include "vpx_util/vpx_write_yuv_frame.h" #if ARCH_ARM #include "vpx_ports/arm.h" #endif @@ -42,6 +45,13 @@ #include "mr_dissim.h" #endif #include "encodeframe.h" +#if CONFIG_MULTITHREAD +#include "ethreading.h" +#endif +#include "picklpf.h" +#if !CONFIG_REALTIME_ONLY +#include "temporal_filter.h" +#endif #include <assert.h> #include <math.h> @@ -50,28 +60,17 @@ #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING extern int vp8_update_coef_context(VP8_COMP *cpi); -extern void vp8_update_coef_probs(VP8_COMP *cpi); #endif -extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); -extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val); -extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi); - extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag); extern void print_parms(VP8_CONFIG *ocf, char *filenam); extern unsigned int vp8_get_processor_freq(); extern void print_tree_update_probs(); -extern int vp8cx_create_encoder_threads(VP8_COMP *cpi); -extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi); - -int vp8_estimate_entropy_savings(VP8_COMP *cpi); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest); -extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); - static void set_default_lf_deltas(VP8_COMP *cpi); extern const int vp8_gf_interval_table[101]; @@ -87,6 +86,9 @@ FILE *yuv_file; #ifdef OUTPUT_YUV_DENOISED FILE *yuv_denoised_file; #endif +#ifdef OUTPUT_YUV_SKINMAP +static FILE *yuv_skinmap_file = NULL; +#endif #if 0 FILE *framepsnr; @@ -219,7 +221,8 @@ static void save_layer_context(VP8_COMP *cpi) { lc->inter_frame_target = cpi->inter_frame_target; lc->total_byte_count = cpi->total_byte_count; lc->filter_level = cpi->common.filter_level; - + lc->frames_since_last_drop_overshoot = cpi->frames_since_last_drop_overshoot; + lc->force_maxqp = cpi->force_maxqp; lc->last_frame_percent_intra = cpi->last_frame_percent_intra; memcpy(lc->count_mb_ref_frame_usage, cpi->mb.count_mb_ref_frame_usage, @@ -255,7 +258,8 @@ static void restore_layer_context(VP8_COMP *cpi, const int layer) { cpi->inter_frame_target = lc->inter_frame_target; cpi->total_byte_count = lc->total_byte_count; cpi->common.filter_level = lc->filter_level; - + cpi->frames_since_last_drop_overshoot = lc->frames_since_last_drop_overshoot; + cpi->force_maxqp = lc->force_maxqp; cpi->last_frame_percent_intra = lc->last_frame_percent_intra; memcpy(cpi->mb.count_mb_ref_frame_usage, lc->count_mb_ref_frame_usage, @@ -447,18 +451,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { cpi->mb.pip = 0; #if CONFIG_MULTITHREAD - /* De-allocate mutex */ - if (cpi->pmutex != NULL) { - VP8_COMMON *const pc = &cpi->common; - int i; - - for (i = 0; i < pc->mb_rows; ++i) { - pthread_mutex_destroy(&cpi->pmutex[i]); - } - vpx_free(cpi->pmutex); - cpi->pmutex = NULL; - } - vpx_free(cpi->mt_current_mb_col); cpi->mt_current_mb_col = NULL; #endif @@ -616,6 
+608,59 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) { set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); } +static void compute_skin_map(VP8_COMP *cpi) { + int mb_row, mb_col, num_bl; + VP8_COMMON *cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + + const SKIN_DETECTION_BLOCK_SIZE bsize = + (cm->Width * cm->Height <= 352 * 288) ? SKIN_8X8 : SKIN_16X16; + + for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { + num_bl = 0; + for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + cpi->skin_map[bl_index] = + vp8_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, cpi->consec_zero_last[bl_index], 0); + num_bl++; + src_y += 16; + src_u += 8; + src_v += 8; + } + src_y += (src_ystride << 4) - (num_bl << 4); + src_u += (src_uvstride << 3) - (num_bl << 3); + src_v += (src_uvstride << 3) - (num_bl << 3); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). Skip the boundary. + for (mb_row = 1; mb_row < cm->mb_rows - 1; mb_row++) { + for (mb_col = 1; mb_col < cm->mb_cols - 1; mb_col++) { + const int bl_index = mb_row * cm->mb_cols + mb_col; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + + for (mi = -1; mi <= 1; mi += 1) { + for (mj = -1; mj <= 1; mj += 1) { + int bl_neighbor_index = (mb_row + mi) * cm->mb_cols + mb_col + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } + } +} + static void set_default_lf_deltas(VP8_COMP *cpi) { cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1; cpi->mb.e_mbd.mode_ref_lf_delta_update = 1; @@ -1096,9 +1141,6 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { int width = cm->Width; int height = cm->Height; -#if CONFIG_MULTITHREAD - int prev_mb_rows = cm->mb_rows; -#endif if (vp8_alloc_frame_buffers(cm, width, height)) { vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, @@ -1190,26 +1232,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { if (cpi->oxcf.multi_threaded > 1) { int i; - /* De-allocate and re-allocate mutex */ - if (cpi->pmutex != NULL) { - for (i = 0; i < prev_mb_rows; ++i) { - pthread_mutex_destroy(&cpi->pmutex[i]); - } - vpx_free(cpi->pmutex); - cpi->pmutex = NULL; - } - - CHECK_MEM_ERROR(cpi->pmutex, - vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows)); - if (cpi->pmutex) { - for (i = 0; i < cm->mb_rows; ++i) { - pthread_mutex_init(&cpi->pmutex[i], NULL); - } - } - vpx_free(cpi->mt_current_mb_col); CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + for (i = 0; i < cm->mb_rows; ++i) + vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); } #endif @@ -1526,9 +1553,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { setup_features(cpi); - { + if (!cpi->use_roi_static_threshold) { int i; - for (i = 0; i < MAX_MB_SEGMENTS; ++i) { cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout; } @@ -1788,6 +1814,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->active_map_enabled = 0; + cpi->use_roi_static_threshold = 0; + #if 0 /* Experimental 
code for lagged and one pass */ /* Initialise one_pass GF frames stats */ @@ -1857,6 +1885,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->cyclic_refresh_map = (signed char *)NULL; } + CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols, + sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR(cpi->consec_zero_last, vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, @@ -1880,6 +1911,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.refresh_alt_ref_frame = 0; cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot = 0; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -1933,6 +1965,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { #ifdef OUTPUT_YUV_DENOISED yuv_denoised_file = fopen("denoised.yuv", "ab"); #endif +#ifdef OUTPUT_YUV_SKINMAP + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); +#endif #if 0 framepsnr = fopen("framepsnr.stt", "a"); @@ -2284,6 +2319,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { dealloc_compressor_data(cpi); vpx_free(cpi->mb.ss); vpx_free(cpi->tok); + vpx_free(cpi->skin_map); vpx_free(cpi->cyclic_refresh_map); vpx_free(cpi->consec_zero_last); vpx_free(cpi->consec_zero_last_mvbias); @@ -2298,6 +2334,9 @@ void vp8_remove_compressor(VP8_COMP **ptr) { #ifdef OUTPUT_YUV_DENOISED fclose(yuv_denoised_file); #endif +#ifdef OUTPUT_YUV_SKINMAP + fclose(yuv_skinmap_file); +#endif #if 0 @@ -2474,34 +2513,6 @@ int vp8_update_entropy(VP8_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) -void vp8_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { - unsigned char *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, yuv_file); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, yuv_file); - src += s->uv_stride; - } while (--h); -} -#endif - static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; @@ -2914,8 +2925,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cm->current_video_frame; cpi->current_ref_frames[ALTREF_FRAME] = cm->current_video_frame; - } else /* For non key frames */ - { + } else { if (cm->refresh_alt_ref_frame) { assert(!cm->copy_buffer_to_arf); @@ -2936,8 +2946,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[ALTREF_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_arf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->gld_fb_idx].flags |= VP8_ALTR_FRAME; yv12_fb[cm->alt_fb_idx].flags &= ~VP8_ALTR_FRAME; @@ -2969,8 +2978,7 @@ static void update_reference_frames(VP8_COMP *cpi) { cpi->current_ref_frames[GOLDEN_FRAME] = cpi->current_ref_frames[LAST_FRAME]; } - } else /* if (cm->copy_buffer_to_gf == 2) */ - { + } else { if (cm->alt_fb_idx != cm->gld_fb_idx) { yv12_fb[cm->alt_fb_idx].flags |= VP8_GOLD_FRAME; yv12_fb[cm->gld_fb_idx].flags &= ~VP8_GOLD_FRAME; @@ -3001,8 +3009,7 @@ static void update_reference_frames(VP8_COMP *cpi) { int i; for (i = LAST_FRAME; i < MAX_REF_FRAMES; ++i) vp8_yv12_copy_frame(cpi->Source, &cpi->denoiser.yv12_running_avg[i]); - } else /* For non key frames */ - { + } else { vp8_yv12_extend_frame_borders( 
&cpi->denoiser.yv12_running_avg[INTRA_FRAME]); @@ -3234,7 +3241,7 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { } #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { sem_post(&cpi->h_event_end_lpf); /* signal that we have set filter_level */ } #endif @@ -3788,6 +3795,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif + compute_skin_map(cpi); + /* Setup background Q adjustment for error resilient mode. * For multi-layer encodes only enable this for the base layer. */ @@ -3861,7 +3870,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif #ifdef OUTPUT_YUV_SRC - vp8_write_yuv_frame(yuv_file, cpi->Source); + vpx_write_yuv_frame(yuv_file, cpi->Source); #endif do { @@ -3989,7 +3998,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #else /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->oxcf.screen_content_mode == 2) { + + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) return; } @@ -4421,11 +4431,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif +#ifdef OUTPUT_YUV_SKINMAP + if (cpi->common.current_video_frame > 1) { + vp8_compute_skin_map(cpi, yuv_skinmap_file); + } +#endif + #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded)) { /* start loopfilter in separate thread */ sem_post(&cpi->h_event_start_lpf); cpi->b_lpf_running = 1; + /* wait for the filter_level to be picked so that we can continue with + * stream packing */ + sem_wait(&cpi->h_event_end_lpf); } else #endif { @@ -4435,7 +4454,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_reference_frames(cpi); #ifdef OUTPUT_YUV_DENOISED - vp8_write_yuv_frame(yuv_denoised_file, + vpx_write_yuv_frame(yuv_denoised_file, &cpi->denoiser.yv12_running_avg[INTRA_FRAME]); #endif @@ -4445,12 +4464,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } #endif -#if CONFIG_MULTITHREAD - /* wait that filter_level is picked so that we can continue with stream - * packing */ - if (cpi->b_multi_threaded) sem_wait(&cpi->h_event_end_lpf); -#endif - /* build the bitstream */ vp8_pack_bitstream(cpi, dest, dest_end, size); @@ -4784,7 +4797,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #endif /* DEBUG */ - /* vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ + /* vpx_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show); */ } #if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -5292,7 +5305,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, #if CONFIG_MULTITHREAD /* wait for the lpf thread done */ - if (cpi->b_multi_threaded && cpi->b_lpf_running) { + if (vpx_atomic_load_acquire(&cpi->b_multi_threaded) && cpi->b_lpf_running) { sem_wait(&cpi->h_event_end_lpf); cpi->b_lpf_running = 0; } @@ -5338,9 +5351,6 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, const int range = 63; int i; - // This method is currently incompatible with the cyclic refresh method - if (cpi->cyclic_refresh_mode_enabled) return -1; - // Check number of rows and columns match if (cpi->common.mb_rows != (int)rows || cpi->common.mb_cols != (int)cols) { return -1; @@ -5359,7 +5369,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, 
return -1; } - if (!map) { + // Also disable segmentation if no deltas are specified. + if (!map || (delta_q[0] == 0 && delta_q[1] == 0 && delta_q[2] == 0 && + delta_q[3] == 0 && delta_lf[0] == 0 && delta_lf[1] == 0 && + delta_lf[2] == 0 && delta_lf[3] == 0 && threshold[0] == 0 && + threshold[1] == 0 && threshold[2] == 0 && threshold[3] == 0)) { disable_segmentation(cpi); return 0; } @@ -5396,6 +5410,11 @@ int vp8_set_roimap(VP8_COMP *cpi, unsigned char *map, unsigned int rows, /* Initialise the feature data structure */ set_segment_data(cpi, &feature_data[0][0], SEGMENT_DELTADATA); + if (threshold[0] != 0 || threshold[1] != 0 || threshold[2] != 0 || + threshold[3] != 0) + cpi->use_roi_static_threshold = 1; + cpi->cyclic_refresh_mode_enabled = 0; + return 0; } diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h index fe775064a..c489b46c2 100644 --- a/libvpx/vp8/encoder/onyx_int.h +++ b/libvpx/vp8/encoder/onyx_int.h @@ -249,6 +249,10 @@ typedef struct { int filter_level; + int frames_since_last_drop_overshoot; + + int force_maxqp; + int last_frame_percent_intra; int count_mb_ref_frame_usage[MAX_REF_FRAMES]; @@ -471,6 +475,8 @@ typedef struct VP8_COMP { int zeromv_count; int lf_zeromv_pct; + unsigned char *skin_map; + unsigned char *segmentation_map; signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; int segment_encode_breakout[MAX_MB_SEGMENTS]; @@ -503,6 +509,7 @@ typedef struct VP8_COMP { int mse_source_denoised; int force_maxqp; + int frames_since_last_drop_overshoot; // GF update for 1 pass cbr. int gf_update_onepass_cbr; @@ -511,11 +518,9 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ - pthread_mutex_t *pmutex; - pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */ - int *mt_current_mb_col; + vpx_atomic_int *mt_current_mb_col; int mt_sync_range; - int b_multi_threaded; + vpx_atomic_int b_multi_threaded; int encoding_thread_count; int b_lpf_running; @@ -687,6 +692,9 @@ typedef struct VP8_COMP { int token_costs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [MAX_ENTROPY_TOKENS]; } rd_costs; + + // Use the static threshold from ROI settings. + int use_roi_static_threshold; } VP8_COMP; void vp8_initialize_enc(void); diff --git a/libvpx/vp8/encoder/pickinter.c b/libvpx/vp8/encoder/pickinter.c index eb713f11c..a9943eb6a 100644 --- a/libvpx/vp8/encoder/pickinter.c +++ b/libvpx/vp8/encoder/pickinter.c @@ -25,6 +25,7 @@ #include "vp8/common/reconintra4x4.h" #include "vpx_dsp/variance.h" #include "mcomp.h" +#include "vp8/common/vp8_skin_detection.h" #include "rdopt.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" @@ -36,82 +37,9 @@ extern unsigned int cnt_pm; #endif -#define MODEL_MODE 1 - extern const int vp8_ref_frame_order[MAX_MODES]; extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; -// Fixed point implementation of a skin color classifier. Skin color -// is model by a Gaussian distribution in the CbCr color space. -// See ../../test/skin_color_detector_test.cc where the reference -// skin color classifier is defined. - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Evaluates the Mahalanobis distance measure for the input CbCr values. 
-static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -// Checks if the input yCbCr values corresponds to skin color. -static int is_skin_color(int y, int cb, int cr, int consec_zeromv) { - if (y < 40 || y > 220) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // No skin if block has been zero motion for long consecutive time. - if (consec_zeromv > 60) return 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; ++i) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { - return 0; - } else if (consec_zeromv > 25 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) { - return 0; - } else { - return 1; - } - } - // Exit if difference is much large than the threshold. - if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - static int macroblock_corner_grad(unsigned char *signal, int stride, int offsetx, int offsety, int sgnx, int sgny) { @@ -760,27 +688,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #endif // Check if current macroblock is in skin area. 
- { - const int y = (x->src.y_buffer[7 * x->src.y_stride + 7] + - x->src.y_buffer[7 * x->src.y_stride + 8] + - x->src.y_buffer[8 * x->src.y_stride + 7] + - x->src.y_buffer[8 * x->src.y_stride + 8]) >> - 2; - const int cb = (x->src.u_buffer[3 * x->src.uv_stride + 3] + - x->src.u_buffer[3 * x->src.uv_stride + 4] + - x->src.u_buffer[4 * x->src.uv_stride + 3] + - x->src.u_buffer[4 * x->src.uv_stride + 4]) >> - 2; - const int cr = (x->src.v_buffer[3 * x->src.uv_stride + 3] + - x->src.v_buffer[3 * x->src.uv_stride + 4] + - x->src.v_buffer[4 * x->src.uv_stride + 3] + - x->src.v_buffer[4 * x->src.uv_stride + 4]) >> - 2; - x->is_skin = 0; - if (!cpi->oxcf.screen_content_mode) { - int block_index = mb_row * cpi->common.mb_cols + mb_col; - x->is_skin = is_skin_color(y, cb, cr, cpi->consec_zero_last[block_index]); - } + x->is_skin = 0; + if (!cpi->oxcf.screen_content_mode) { + int block_index = mb_row * cpi->common.mb_cols + mb_col; + x->is_skin = cpi->skin_map[block_index]; } #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { diff --git a/libvpx/vp8/encoder/picklpf.c b/libvpx/vp8/encoder/picklpf.c index 6f287322e..b1b712db9 100644 --- a/libvpx/vp8/encoder/picklpf.c +++ b/libvpx/vp8/encoder/picklpf.c @@ -12,6 +12,7 @@ #include "./vpx_scale_rtcd.h" #include "vp8/common/onyxc_int.h" #include "onyx_int.h" +#include "vp8/encoder/picklpf.h" #include "vp8/encoder/quantize.h" #include "vpx_mem/vpx_mem.h" #include "vpx_scale/vpx_scale.h" diff --git a/libvpx/vp8/encoder/picklpf.h b/libvpx/vp8/encoder/picklpf.h new file mode 100644 index 000000000..e6ad0dbf2 --- /dev/null +++ b/libvpx/vp8/encoder/picklpf.h @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_PICKLPF_H_ +#define VP8_ENCODER_PICKLPF_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; +struct yv12_buffer_config; + +void vp8cx_pick_filter_level_fast(struct yv12_buffer_config *sd, + struct VP8_COMP *cpi); +void vp8cx_set_alt_lf_level(struct VP8_COMP *cpi, int filt_val); +void vp8cx_pick_filter_level(struct yv12_buffer_config *sd, struct VP8_COMP *cpi); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_PICKLPF_H_ diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c index e89247ae4..e58c31098 100644 --- a/libvpx/vp8/encoder/ratectrl.c +++ b/libvpx/vp8/encoder/ratectrl.c @@ -498,11 +498,9 @@ static void calc_gf_params(VP8_COMP *cpi) { * This is updated once the real frame size/boost is known. */ if (cpi->oxcf.fixed_q == -1) { - if (cpi->pass == 2) /* 2 Pass */ - { + if (cpi->pass == 2) { /* 2 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; - } else /* 1 Pass */ - { + } else { /* 1 Pass */ cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; if (cpi->last_boost > 750) cpi->frames_till_gf_update_due++; @@ -1442,12 +1440,33 @@ int vp8_pick_frame_size(VP8_COMP *cpi) { // If this just encoded frame (mcomp/transform/quant, but before loopfilter and // pack_bitstream) has large overshoot, and was not being encoded close to the // max QP, then drop this frame and force next frame to be encoded at max QP. -// Condition this on 1 pass CBR with screen content mode and frame dropper off.
+// Allow this for screen_content_mode = 2, or if frame dropping is allowed. // TODO(marpan): Should do this exit condition during the encode_frame // (i.e., halfway during the encoding of the frame) to save cycles. int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && - cpi->drop_frames_allowed == 0 && cpi->common.frame_type != KEY_FRAME) { + int force_drop_overshoot = 0; +#if CONFIG_MULTI_RES_ENCODING + // Only check for dropping due to overshoot on the lowest stream. + // If the lowest stream of the multi-res encoding was dropped due to + // overshoot, then force dropping on all upper layer streams + // (mr_encoder_id > 0). + LOWER_RES_FRAME_INFO *low_res_frame_info = + (LOWER_RES_FRAME_INFO *)cpi->oxcf.mr_low_res_mode_info; + if (cpi->oxcf.mr_total_resolutions > 1 && cpi->oxcf.mr_encoder_id > 0) { + force_drop_overshoot = low_res_frame_info->is_frame_dropped_overshoot_maxqp; + if (!force_drop_overshoot) { + cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; + return 0; + } + } +#endif + if (cpi->common.frame_type != KEY_FRAME && + (cpi->oxcf.screen_content_mode == 2 || + (cpi->drop_frames_allowed && + (force_drop_overshoot || + (cpi->rate_correction_factor < (4.0f * MIN_BPB_FACTOR) && + cpi->frames_since_last_drop_overshoot > (int)cpi->framerate))))) { // Note: the "projected_frame_size" from encode_frame() only gives estimate // of mode/motion vector rate (in non-rd mode): so below we only require // that projected_frame_size is somewhat greater than per-frame-bandwidth, @@ -1458,17 +1477,20 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { // Rate threshold, in bytes. int thresh_rate = 2 * (cpi->av_per_frame_bandwidth >> 3); // Threshold for the average (over all macroblocks) of the pixel-sum - // residual error over 16x16 block. Should add QP dependence on threshold? - int thresh_pred_err_mb = (256 << 4); + // residual error over 16x16 block. + int thresh_pred_err_mb = (200 << 4); int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs); - if (Q < thresh_qp && cpi->projected_frame_size > thresh_rate && - pred_err_mb > thresh_pred_err_mb) { + // Reduce/ignore thresh_rate if pred_err_mb is much larger than its threshold, + // to give more weight to the pred_err metric for overshoot detection. + if (cpi->drop_frames_allowed && pred_err_mb > (thresh_pred_err_mb << 4)) + thresh_rate = thresh_rate >> 3; + if ((Q < thresh_qp && cpi->projected_frame_size > thresh_rate && + pred_err_mb > thresh_pred_err_mb) || + force_drop_overshoot) { + unsigned int i; double new_correction_factor; - const int target_size = cpi->av_per_frame_bandwidth; int target_bits_per_mb; - // Drop this frame: advance frame counters, and set force_maxqp flag. - cpi->common.current_video_frame++; - cpi->frames_since_key++; + const int target_size = cpi->av_per_frame_bandwidth; // Flag to indicate we will force next frame to be encoded at max QP. cpi->force_maxqp = 1; // Reset the buffer levels. @@ -1499,14 +1521,40 @@ int vp8_drop_encodedframe_overshoot(VP8_COMP *cpi, int Q) { if (cpi->rate_correction_factor > MAX_BPB_FACTOR) { cpi->rate_correction_factor = MAX_BPB_FACTOR; } + // Drop this frame: update frame counters. + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->temporal_pattern_counter++; + cpi->frames_since_last_drop_overshoot = 0; + if (cpi->oxcf.number_of_layers > 1) { + // Set max_qp and rate correction for all temporal layers if overshoot + // is detected.
+ for (i = 0; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->force_maxqp = 1; + lc->frames_since_last_drop_overshoot = 0; + lc->rate_correction_factor = cpi->rate_correction_factor; + } + } +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 1; +#endif return 1; - } else { - cpi->force_maxqp = 0; - return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } cpi->force_maxqp = 0; + cpi->frames_since_last_drop_overshoot++; +#if CONFIG_MULTI_RES_ENCODING + if (cpi->oxcf.mr_total_resolutions > 1) + low_res_frame_info->is_frame_dropped_overshoot_maxqp = 0; +#endif return 0; } diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c index 3792b10f8..e210b4410 100644 --- a/libvpx/vp8/encoder/rdopt.c +++ b/libvpx/vp8/encoder/rdopt.c @@ -16,12 +16,14 @@ #include "vpx_config.h" #include "vp8_rtcd.h" #include "./vpx_dsp_rtcd.h" +#include "encodeframe.h" #include "tokenize.h" #include "treewriter.h" #include "onyx_int.h" #include "modecosts.h" #include "encodeintra.h" #include "pickinter.h" +#include "vp8/common/common.h" #include "vp8/common/entropymode.h" #include "vp8/common/reconinter.h" #include "vp8/common/reconintra.h" @@ -852,8 +854,7 @@ static int labels2mode(MACROBLOCK *x, int const *labelings, int which_label, default: break; } - if (m == ABOVE4X4) /* replace above with left if same */ - { + if (m == ABOVE4X4) { /* replace above with left if same */ int_mv left_mv; left_mv.as_int = col ? d[-1].bmi.mv.as_int : left_block_mv(mic, i); @@ -959,19 +960,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, vp8_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT_PLANES t_above, t_left; - ENTROPY_CONTEXT *ta; - ENTROPY_CONTEXT *tl; ENTROPY_CONTEXT_PLANES t_above_b, t_left_b; - ENTROPY_CONTEXT *ta_b; - ENTROPY_CONTEXT *tl_b; memcpy(&t_above, x->e_mbd.above_context, sizeof(ENTROPY_CONTEXT_PLANES)); memcpy(&t_left, x->e_mbd.left_context, sizeof(ENTROPY_CONTEXT_PLANES)); - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - ta_b = (ENTROPY_CONTEXT *)&t_above_b; - tl_b = (ENTROPY_CONTEXT *)&t_left_b; + vp8_zero(t_above_b); + vp8_zero(t_left_b); br = 0; bd = 0; @@ -1151,13 +1146,13 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, mode_selected = this_mode; best_label_rd = this_rd; - memcpy(ta_b, ta_s, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl_b, tl_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_above_b, &t_above_s, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left_b, &t_left_s, sizeof(ENTROPY_CONTEXT_PLANES)); } } /*for each 4x4 mode*/ - memcpy(ta, ta_b, sizeof(ENTROPY_CONTEXT_PLANES)); - memcpy(tl, tl_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_above, &t_above_b, sizeof(ENTROPY_CONTEXT_PLANES)); + memcpy(&t_left, &t_left_b, sizeof(ENTROPY_CONTEXT_PLANES)); labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], bsi->ref_mv, x->mvcost); diff --git a/libvpx/vp8/encoder/rdopt.h b/libvpx/vp8/encoder/rdopt.h index 8186ff105..960bd8f1c 100644 --- a/libvpx/vp8/encoder/rdopt.h +++ b/libvpx/vp8/encoder/rdopt.h @@ -19,6 +19,9 @@ extern "C" { #define RDCOST(RM, DM, R, D) (((128 + (R) * (RM)) >> 8) + (DM) * (D)) +void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex); +void vp8_auto_select_speed(VP8_COMP *cpi); + static 
INLINE void insertsortmv(int arr[], int len) { int i, j, k; diff --git a/libvpx/vp8/encoder/temporal_filter.c b/libvpx/vp8/encoder/temporal_filter.c index 1b2f46bb6..0a7d25fb0 100644 --- a/libvpx/vp8/encoder/temporal_filter.c +++ b/libvpx/vp8/encoder/temporal_filter.c @@ -20,6 +20,7 @@ #include "ratectrl.h" #include "vp8/common/quant_common.h" #include "segmentation.h" +#include "temporal_filter.h" #include "vpx_mem/vpx_mem.h" #include "vp8/common/swapyv12buffer.h" #include "vp8/common/threading.h" diff --git a/libvpx/vp8/encoder/temporal_filter.h b/libvpx/vp8/encoder/temporal_filter.h new file mode 100644 index 000000000..865d909fb --- /dev/null +++ b/libvpx/vp8/encoder/temporal_filter.h @@ -0,0 +1,26 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP8_ENCODER_TEMPORAL_FILTER_H_ +#define VP8_ENCODER_TEMPORAL_FILTER_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +struct VP8_COMP; + +void vp8_temporal_filter_prepare_c(struct VP8_COMP *cpi, int distance); + +#ifdef __cplusplus +} +#endif + +#endif // VP8_ENCODER_TEMPORAL_FILTER_H_ diff --git a/libvpx/vp8/encoder/x86/dct_sse2.asm b/libvpx/vp8/encoder/x86/dct_sse2.asm index d06bca592..4d92f0341 100644 --- a/libvpx/vp8/encoder/x86/dct_sse2.asm +++ b/libvpx/vp8/encoder/x86/dct_sse2.asm @@ -60,6 +60,8 @@ ret %endmacro +SECTION .text + ;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch) global sym(vp8_short_fdct4x4_sse2) PRIVATE sym(vp8_short_fdct4x4_sse2): diff --git a/libvpx/vp8/encoder/x86/encodeopt.asm b/libvpx/vp8/encoder/x86/encodeopt.asm index 0297220ee..f6c6aeae7 100644 --- a/libvpx/vp8/encoder/x86/encodeopt.asm +++ b/libvpx/vp8/encoder/x86/encodeopt.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;int vp8_block_error_sse2(short *coeff_ptr, short *dcoef_ptr) global sym(vp8_block_error_sse2) PRIVATE sym(vp8_block_error_sse2): diff --git a/libvpx/vp8/encoder/x86/fwalsh_sse2.asm b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm index f4989279f..b5d5de4a5 100644 --- a/libvpx/vp8/encoder/x86/fwalsh_sse2.asm +++ b/libvpx/vp8/encoder/x86/fwalsh_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch) global sym(vp8_short_walsh4x4_sse2) PRIVATE sym(vp8_short_walsh4x4_sse2): diff --git a/libvpx/vp8/encoder/x86/quantize_mmx.asm b/libvpx/vp8/encoder/x86/quantize_mmx.asm deleted file mode 100644 index 2864ce16d..000000000 --- a/libvpx/vp8/encoder/x86/quantize_mmx.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
-; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, -; short *qcoeff_ptr,short *dequant_ptr, -; short *scan_mask, short *round_ptr, -; short *quant_ptr, short *dqcoeff_ptr); -global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE -sym(vp8_fast_quantize_b_impl_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - movq mm0, [rsi] - - mov rax, arg(1) ;zbin_ptr - movq mm1, [rax] - - movq mm3, mm0 - psraw mm0, 15 - - pxor mm3, mm0 - psubw mm3, mm0 ; abs - - movq mm2, mm3 - pcmpgtw mm1, mm2 - - pandn mm1, mm2 - movq mm3, mm1 - - mov rdx, arg(6) ;quant_ptr - movq mm1, [rdx] - - mov rcx, arg(5) ;round_ptr - movq mm2, [rcx] - - paddw mm3, mm2 - pmulhuw mm3, mm1 - - pxor mm3, mm0 - psubw mm3, mm0 ;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - movq mm0, mm3 - - movq [rdi], mm3 - - mov rax, arg(3) ;dequant_ptr - movq mm2, [rax] - - pmullw mm3, mm2 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax], mm3 - - ; next 8 - movq mm4, [rsi+8] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+8] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+8] - movq mm6, [rcx+8] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+8], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+8] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+8], mm7 - - - ; next 8 - movq mm4, [rsi+16] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+16] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+16] - movq mm6, [rcx+16] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+16], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+16] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+16], mm7 - - - ; next 8 - movq mm4, [rsi+24] - - mov rax, arg(1) ;zbin_ptr - movq mm5, [rax+24] - - movq mm7, mm4 - psraw mm4, 15 - - pxor mm7, mm4 - psubw mm7, mm4 ; abs - - movq mm6, mm7 - pcmpgtw mm5, mm6 - - pandn mm5, mm6 - movq mm7, mm5 - - movq mm5, [rdx+24] - movq mm6, [rcx+24] - - paddw mm7, mm6 - pmulhuw mm7, mm5 - - pxor mm7, mm4 - psubw mm7, mm4;gain the sign back - - mov rdi, arg(2) ;qcoeff_ptr - - movq mm1, mm7 - movq [rdi+24], mm7 - - mov rax, arg(3) ;dequant_ptr - movq mm6, [rax+24] - - pmullw mm7, mm6 - mov rax, arg(7) ;dqcoeff_ptr - - movq [rax+24], mm7 - - - - mov rdi, arg(4) ;scan_mask - mov rsi, arg(2) ;qcoeff_ptr - - pxor mm5, mm5 - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rsi+8] - - movq mm2, [rdi] - movq mm3, [rdi+8]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - movq mm5, mm0 - - paddd mm5, mm1 - - movq mm0, [rsi+16] - movq mm1, [rsi+24] - - movq mm2, [rdi+16] - movq mm3, [rdi+24]; - - pcmpeqw mm0, mm7 - pcmpeqw mm1, mm7 - - pcmpeqw mm6, mm6 - pxor mm0, mm6 - - pxor mm1, mm6 - psrlw mm0, 15 - - psrlw mm1, 15 - pmaddwd mm0, mm2 - - pmaddwd mm1, mm3 - paddd mm5, mm0 - - paddd mm5, mm1 - movq mm0, mm5 - - psrlq mm5, 32 - paddd mm0, mm5 - - ; eob adjustment begins here - movq rcx, mm0 - and rcx, 0xffff - - xor rdx, rdx 
- sub rdx, rcx ; rdx=-rcx - - bsr rax, rcx - inc rax - - sar rdx, 31 - and rax, rdx - ; Substitute the sse assembly for the old mmx mixed assembly/C. The - ; following is kept as reference - ; movq rcx, mm0 - ; bsr rax, rcx - ; - ; mov eob, rax - ; mov eee, rcx - ; - ;if(eee==0) - ;{ - ; eob=-1; - ;} - ;else if(eee<0) - ;{ - ; eob=15; - ;} - ;d->eob = eob+1; - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm index bd92b398a..d2b4711b8 100644 --- a/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/libvpx/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ; void vp8_temporal_filter_apply_sse2 | arg ; (unsigned char *frame1, | 0 ; unsigned int stride, | 1 @@ -203,5 +205,5 @@ align 16 _const_top_bit: times 8 dw 1<<15 align 16 -_const_16w +_const_16w: times 8 dw 16 diff --git a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c deleted file mode 100644 index 4406dd0cc..000000000 --- a/libvpx/vp8/encoder/x86/vp8_enc_stubs_mmx.c +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vpx_ports/x86.h" -#include "vp8/encoder/block.h" - -int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr, - short *qcoeff_ptr, short *dequant_ptr, - const short *scan_mask, short *round_ptr, - short *quant_ptr, short *dqcoeff_ptr); -void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d) { - const short *scan_mask = vp8_default_zig_zag_mask; - short *coeff_ptr = b->coeff; - short *zbin_ptr = b->zbin; - short *round_ptr = b->round; - short *quant_ptr = b->quant_fast; - short *qcoeff_ptr = d->qcoeff; - short *dqcoeff_ptr = d->dqcoeff; - short *dequant_ptr = d->dequant; - - *d->eob = (char)vp8_fast_quantize_b_impl_mmx( - coeff_ptr, zbin_ptr, qcoeff_ptr, dequant_ptr, scan_mask, - - round_ptr, quant_ptr, dqcoeff_ptr); -} diff --git a/libvpx/vp8/encoder/x86/quantize_ssse3.c b/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c index 322f0a151..d54745015 100644 --- a/libvpx/vp8/encoder/x86/quantize_ssse3.c +++ b/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c @@ -10,6 +10,7 @@ #include <tmmintrin.h> /* SSSE3 */ +#include "./vp8_rtcd.h" #include "vp8/encoder/block.h" /* bitscan reverse (bsr) */ diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk index 137f5bb62..246fe6a67 100644 --- a/libvpx/vp8/vp8_common.mk +++ b/libvpx/vp8/vp8_common.mk @@ -116,6 +116,14 @@ VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/loopfilter_filters_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h +# common (c) +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/sixtap_filter_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c +VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c 
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c + ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c endif diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c index f8475ed61..af6689fd9 100644 --- a/libvpx/vp8/vp8_cx_iface.c +++ b/libvpx/vp8/vp8_cx_iface.c @@ -1216,6 +1216,7 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 50, /* rc_two_pass_vbrbias */ 0, /* rc_two_pass_vbrmin_section */ 400, /* rc_two_pass_vbrmax_section */ + 0, // rc_2pass_vbr_corpus_complexity (only meaningful for VP9) /* keyframing settings (kf) */ VPX_KF_AUTO, /* g_kfmode*/ diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c index 9ea9c7f04..f20283c1e 100644 --- a/libvpx/vp8/vp8_dx_iface.c +++ b/libvpx/vp8/vp8_dx_iface.c @@ -144,8 +144,7 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, } si->is_kf = 0; - if (data_sz >= 10 && !(clear[0] & 0x01)) /* I-Frame */ - { + if (data_sz >= 10 && !(clear[0] & 0x01)) { /* I-Frame */ si->is_kf = 1; /* vet via sync code */ @@ -228,7 +227,8 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, } static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, - unsigned int data_sz, vpx_codec_err_t *res) { + unsigned int data_sz, + volatile vpx_codec_err_t *res) { *res = VPX_CODEC_OK; if (ctx->fragments.count == 0) { @@ -267,7 +267,7 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *data, unsigned int data_sz, void *user_priv, long deadline) { - vpx_codec_err_t res = VPX_CODEC_OK; + volatile vpx_codec_err_t res; unsigned int resolution_change = 0; unsigned int w, h; @@ -414,7 +414,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, #endif #if CONFIG_MULTITHREAD - if (pbi->b_multithreaded_rd) { + if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) { vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows); } #else @@ -580,7 +580,6 @@ static vpx_codec_err_t vp8_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, } } -extern int vp8dx_references_buffer(VP8_COMMON *oci, int ref_frame); static vpx_codec_err_t vp8_get_last_ref_frame(vpx_codec_alg_priv_t *ctx, va_list args) { int *ref_info = va_arg(args, int *); diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk index 7bd41a3fb..0dac0169d 100644 --- a/libvpx/vp8/vp8cx.mk +++ b/libvpx/vp8/vp8cx.mk @@ -30,6 +30,7 @@ VP8_CX_SRCS-yes += encoder/encodeintra.c VP8_CX_SRCS-yes += encoder/encodemb.c VP8_CX_SRCS-yes += encoder/encodemv.c VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.c +VP8_CX_SRCS-$(CONFIG_MULTITHREAD) += encoder/ethreading.h VP8_CX_SRCS-yes += encoder/firstpass.c VP8_CX_SRCS-yes += encoder/block.h VP8_CX_SRCS-yes += encoder/boolhuff.h @@ -56,11 +57,14 @@ VP8_CX_SRCS-yes += encoder/modecosts.c VP8_CX_SRCS-yes += encoder/onyx_if.c VP8_CX_SRCS-yes += encoder/pickinter.c VP8_CX_SRCS-yes += encoder/picklpf.c +VP8_CX_SRCS-yes += encoder/picklpf.h VP8_CX_SRCS-yes += encoder/vp8_quantize.c VP8_CX_SRCS-yes += encoder/ratectrl.c VP8_CX_SRCS-yes += encoder/rdopt.c VP8_CX_SRCS-yes += encoder/segmentation.c VP8_CX_SRCS-yes += encoder/segmentation.h +VP8_CX_SRCS-yes += common/vp8_skin_detection.c +VP8_CX_SRCS-yes += common/vp8_skin_detection.h VP8_CX_SRCS-yes += encoder/tokenize.c VP8_CX_SRCS-yes += encoder/dct_value_cost.h VP8_CX_SRCS-yes += encoder/dct_value_tokens.h @@ -68,19 +72,20 @@ VP8_CX_SRCS-yes += encoder/treewriter.c VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) +=
common/postproc.h VP8_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/postproc.c VP8_CX_SRCS-yes += encoder/temporal_filter.c +VP8_CX_SRCS-yes += encoder/temporal_filter.h VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.c VP8_CX_SRCS-$(CONFIG_MULTI_RES_ENCODING) += encoder/mr_dissim.h ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-yes += encoder/firstpass.c VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.c +VP8_CX_SRCS_REMOVE-yes += encoder/temporal_filter.h endif -VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_quantize_sse2.c -VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp8_quantize_ssse3.c VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) @@ -89,7 +94,6 @@ endif VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c -VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) @@ -106,6 +110,9 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c +VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c + ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c endif diff --git a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index dd1ea03b6..025254c3f 100644 --- a/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -14,14 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" - -static int16_t sinpi_1_9 = 0x14a3; -static int16_t sinpi_2_9 = 0x26c9; -static int16_t sinpi_3_9 = 0x3441; -static int16_t sinpi_4_9 = 0x3b6c; -static int16_t cospi_8_64 = 0x3b21; -static int16_t cospi_16_64 = 0x2d41; -static int16_t cospi_24_64 = 0x187e; +#include "vpx_dsp/txfm_common.h" static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) { int32x4_t q8s32, q9s32; diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c index 66aa733b9..7345e259b 100644 --- a/libvpx/vp9/common/vp9_alloccommon.c +++ b/libvpx/vp9/common/vp9_alloccommon.c @@ -17,24 +17,6 @@ #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_onyxc_int.h" -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. 
-void lock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_lock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - -void unlock_buffer_pool(BufferPool *const pool) { -#if CONFIG_MULTITHREAD - pthread_mutex_unlock(&pool->pool_mutex); -#else - (void)pool; -#endif -} - void vp9_set_mb_mi(VP9_COMMON *cm, int width, int height) { const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2); const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2); @@ -62,8 +44,7 @@ static int alloc_seg_map(VP9_COMMON *cm, int seg_map_size) { cm->prev_seg_map_idx = 1; cm->current_frame_seg_map = cm->seg_map_array[cm->seg_map_idx]; - if (!cm->frame_parallel_decode) - cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; + cm->last_frame_seg_map = cm->seg_map_array[cm->prev_seg_map_idx]; return 0; } @@ -77,20 +58,18 @@ static void free_seg_map(VP9_COMMON *cm) { } cm->current_frame_seg_map = NULL; - - if (!cm->frame_parallel_decode) { - cm->last_frame_seg_map = NULL; - } + cm->last_frame_seg_map = NULL; } void vp9_free_ref_frame_buffers(BufferPool *pool) { int i; for (i = 0; i < FRAME_BUFFERS; ++i) { - if (pool->frame_bufs[i].ref_count > 0 && + if (!pool->frame_bufs[i].released && pool->frame_bufs[i].raw_frame_buffer.data != NULL) { pool->release_fb_cb(pool->cb_priv, &pool->frame_bufs[i].raw_frame_buffer); pool->frame_bufs[i].ref_count = 0; + pool->frame_bufs[i].released = 1; } vpx_free(pool->frame_bufs[i].mvs); pool->frame_bufs[i].mvs = NULL; @@ -176,6 +155,9 @@ fail: } void vp9_remove_common(VP9_COMMON *cm) { +#if CONFIG_VP9_POSTPROC + vp9_free_postproc_buffers(cm); +#endif vp9_free_context_buffers(cm); vpx_free(cm->fc); @@ -186,7 +168,7 @@ void vp9_remove_common(VP9_COMMON *cm) { void vp9_init_context_buffers(VP9_COMMON *cm) { cm->setup_mi(cm); - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, cm->mi_rows * cm->mi_cols); } diff --git a/libvpx/vp9/common/vp9_entropymode.c b/libvpx/vp9/common/vp9_entropymode.c index bcb9e8f29..47cd63e94 100644 --- a/libvpx/vp9/common/vp9_entropymode.c +++ b/libvpx/vp9/common/vp9_entropymode.c @@ -428,7 +428,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { vp9_clearall_segfeatures(&cm->seg); cm->seg.abs_delta = SEGMENT_DELTADATA; - if (cm->last_frame_seg_map && !cm->frame_parallel_decode) + if (cm->last_frame_seg_map) memset(cm->last_frame_seg_map, 0, (cm->mi_rows * cm->mi_cols)); if (cm->current_frame_seg_map) @@ -457,7 +457,7 @@ void vp9_setup_past_independence(VP9_COMMON *cm) { } // prev_mip will only be allocated in encoder. 
- if (frame_is_intra_only(cm) && cm->prev_mip && !cm->frame_parallel_decode) + if (frame_is_intra_only(cm) && cm->prev_mip) memset(cm->prev_mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->prev_mip)); diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c index ef0297dd5..c7c343aed 100644 --- a/libvpx/vp9/common/vp9_loopfilter.c +++ b/libvpx/vp9/common/vp9_loopfilter.c @@ -1612,12 +1612,14 @@ void vp9_loop_filter_data_reset( void vp9_reset_lfm(VP9_COMMON *const cm) { if (cm->lf.filter_level) { - memset(cm->lf.lfm, 0, ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * - cm->lf.lfm_stride * sizeof(*cm->lf.lfm)); + memset(cm->lf.lfm, 0, + ((cm->mi_rows + (MI_BLOCK_SIZE - 1)) >> 3) * cm->lf.lfm_stride * + sizeof(*cm->lf.lfm)); } } -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) { +int vp9_loop_filter_worker(void *arg1, void *unused) { + LFWorkerData *const lf_data = (LFWorkerData *)arg1; (void)unused; loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only); diff --git a/libvpx/vp9/common/vp9_loopfilter.h b/libvpx/vp9/common/vp9_loopfilter.h index da37a6ebd..481a6cdc6 100644 --- a/libvpx/vp9/common/vp9_loopfilter.h +++ b/libvpx/vp9/common/vp9_loopfilter.h @@ -151,8 +151,8 @@ void vp9_loop_filter_data_reset( LFWorkerData *lf_data, YV12_BUFFER_CONFIG *frame_buffer, struct VP9Common *cm, const struct macroblockd_plane planes[MAX_MB_PLANE]); -// Operates on the rows described by 'lf_data'. -int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused); +// Operates on the rows described by 'arg1' (cast to LFWorkerData *). +int vp9_loop_filter_worker(void *arg1, void *unused); #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/common/vp9_onyxc_int.h b/libvpx/vp9/common/vp9_onyxc_int.h index 32db7b7aa..1d96d92c2 100644 --- a/libvpx/vp9/common/vp9_onyxc_int.h +++ b/libvpx/vp9/common/vp9_onyxc_int.h @@ -37,13 +37,10 @@ extern "C" { #define REF_FRAMES_LOG2 3 #define REF_FRAMES (1 << REF_FRAMES_LOG2) -// 4 scratch frames for the new frames to support a maximum of 4 cores decoding -// in parallel, 3 for scaled references on the encoder. -// TODO(hkuang): Add ondemand frame buffers instead of hardcoding the number -// of framebuffers. +// 1 scratch frame for the new frame, 3 for scaled references on the encoder. // TODO(jkoleszar): These 3 extra references could probably come from the // normal reference pool. -#define FRAME_BUFFERS (REF_FRAMES + 7) +#define FRAME_BUFFERS (REF_FRAMES + 4) #define FRAME_CONTEXTS_LOG2 2 #define FRAME_CONTEXTS (1 << FRAME_CONTEXTS_LOG2) @@ -72,30 +69,12 @@ typedef struct { MV_REF *mvs; int mi_rows; int mi_cols; + uint8_t released; vpx_codec_frame_buffer_t raw_frame_buffer; YV12_BUFFER_CONFIG buf; - - // The Following variables will only be used in frame parallel decode. - - // frame_worker_owner indicates which FrameWorker owns this buffer. NULL means - // that no FrameWorker owns, or is decoding, this buffer. - VPxWorker *frame_worker_owner; - - // row and col indicate which position frame has been decoded to in real - // pixel unit. They are reset to -1 when decoding begins and set to INT_MAX - // when the frame is fully decoded. - int row; - int col; } RefCntBuffer; typedef struct BufferPool { -// Protect BufferPool from being accessed by several FrameWorkers at -// the same time during frame parallel decode. -// TODO(hkuang): Try to use atomic variable instead of locking the whole pool. 
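The vp9_loop_filter_worker() change above switches to the VPxWorkerHook convention of taking two void pointers and casting inside the hook; the old code cast the function pointer itself to VPxWorkerHook, which is formally undefined behavior when the signatures differ. The shape of the pattern, with LFRows as a hypothetical stand-in for LFWorkerData:

typedef int (*WorkerHook)(void *arg1, void *arg2);  // matches VPxWorkerHook

typedef struct {
  int start, stop;  // row range this worker filters
} LFRows;

static int lf_hook(void *arg1, void *arg2) {
  LFRows *const rows = (LFRows *)arg1;  // cast the argument, not the function
  (void)arg2;
  // ... filter rows->start .. rows->stop ...
  return 1;  // hooks report success with a nonzero return
}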
-#if CONFIG_MULTITHREAD - pthread_mutex_t pool_mutex; -#endif - // Private data associated with the frame buffer callbacks. void *cb_priv; @@ -235,10 +214,6 @@ typedef struct VP9Common { struct loopfilter lf; struct segmentation seg; - // TODO(hkuang): Remove this as it is the same as frame_parallel_decode - // in pbi. - int frame_parallel_decode; // frame-based threading. - // Context probabilities for reference frame prediction MV_REFERENCE_FRAME comp_fixed_ref; MV_REFERENCE_FRAME comp_var_ref[2]; @@ -283,11 +258,6 @@ typedef struct VP9Common { int above_context_alloc_cols; } VP9_COMMON; -// TODO(hkuang): Don't need to lock the whole pool after implementing atomic -// frame reference count. -void lock_buffer_pool(BufferPool *const pool); -void unlock_buffer_pool(BufferPool *const pool); - static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) { if (index < 0 || index >= REF_FRAMES) return NULL; if (cm->ref_frame_map[index] < 0) return NULL; @@ -303,7 +273,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; int i; - lock_buffer_pool(cm->buffer_pool); for (i = 0; i < FRAME_BUFFERS; ++i) if (frame_bufs[i].ref_count == 0) break; @@ -314,7 +283,6 @@ static INLINE int get_free_fb(VP9_COMMON *cm) { i = INVALID_IDX; } - unlock_buffer_pool(cm->buffer_pool); return i; } @@ -342,7 +310,7 @@ static INLINE void set_partition_probs(const VP9_COMMON *const cm, xd->partition_probs = frame_is_intra_only(cm) ? &vp9_kf_partition_probs[0] - : (const vpx_prob(*)[PARTITION_TYPES - 1])cm->fc->partition_prob; + : (const vpx_prob(*)[PARTITION_TYPES - 1]) cm->fc->partition_prob; } static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd, diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c index b105e5d45..dfc315eea 100644 --- a/libvpx/vp9/common/vp9_postproc.c +++ b/libvpx/vp9/common/vp9_postproc.c @@ -380,7 +380,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, // if mfqe is enabled. Need to take both the quality and the speed // into consideration. 
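Spelling out the FRAME_BUFFERS arithmetic above: REF_FRAMES_LOG2 = 3 gives 8 reference slots, so the pool shrinks from 8 + 7 = 15 buffers (4 scratch frames for four-way frame-parallel decode plus 3 encoder scaled references) to 8 + 4 = 12 (1 scratch frame for the new frame plus the same 3 scaled references). As a compile-time restatement:

#define REF_FRAMES_LOG2 3
#define REF_FRAMES (1 << REF_FRAMES_LOG2)  // 8 reference slots
#define FRAME_BUFFERS (REF_FRAMES + 4)     // + 1 new frame + 3 scaled refs

typedef char check_frame_buffers[FRAME_BUFFERS == 12 ? 1 : -1];  // was 15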
if ((flags & VP9D_DEMACROBLOCK) || (flags & VP9D_DEBLOCK)) { - vp8_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); + vpx_yv12_copy_frame(ppbuf, &cm->post_proc_buffer_int); } if ((flags & VP9D_DEMACROBLOCK) && cm->post_proc_buffer_int.buffer_alloc) { deblock_and_de_macro_block(&cm->post_proc_buffer_int, ppbuf, @@ -390,7 +390,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, vp9_deblock(&cm->post_proc_buffer_int, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); + vpx_yv12_copy_frame(&cm->post_proc_buffer_int, ppbuf); } } else if (flags & VP9D_DEMACROBLOCK) { deblock_and_de_macro_block(cm->frame_to_show, ppbuf, @@ -399,7 +399,7 @@ int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, } else if (flags & VP9D_DEBLOCK) { vp9_deblock(cm->frame_to_show, ppbuf, q, cm->postproc_state.limits); } else { - vp8_yv12_copy_frame(cm->frame_to_show, ppbuf); + vpx_yv12_copy_frame(cm->frame_to_show, ppbuf); } ppstate->last_base_qindex = cm->base_qindex; diff --git a/libvpx/vp9/common/vp9_reconinter.h b/libvpx/vp9/common/vp9_reconinter.h index 1b09b380d..bb9291a26 100644 --- a/libvpx/vp9/common/vp9_reconinter.h +++ b/libvpx/vp9/common/vp9_reconinter.h @@ -26,9 +26,9 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys) { - sf->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h); + sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst, + dst_stride, kernel, subpel_x, + xs, subpel_y, ys, w, h); } #if CONFIG_VP9_HIGHBITDEPTH @@ -37,8 +37,8 @@ static INLINE void highbd_inter_predictor( const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h, bd); + src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w, + h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl index baf63e97f..22b67ecac 100644 --- a/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. 
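The predict callbacks in vp9_reconinter.h above now take the whole filter bank plus the subpixel phases instead of the pre-indexed tap sets kernel[subpel_x] and kernel[subpel_y], letting a convolve routine pick both tap sets itself. Assuming the usual vpx_filter.h shapes (8 taps per phase, 16 phases), the indexing inside the callee is just:

#include <stdint.h>

#define SUBPEL_BITS 4
#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)  // 16 subpixel phases
#define SUBPEL_TAPS 8

typedef int16_t InterpKernel[SUBPEL_TAPS];

static const int16_t *get_taps(const InterpKernel *kernel, int subpel) {
  return kernel[subpel & (SUBPEL_SHIFTS - 1)];  // phase in 0..15
}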
+## + sub vp9_common_forward_decls() { print <<EOF /* @@ -30,6 +40,7 @@ if ($opts{arch} eq "x86_64") { $ssse3_x86_64 = 'ssse3'; $avx_x86_64 = 'avx'; $avx2_x86_64 = 'avx2'; + $avx512_x86_64 = 'avx512'; } # @@ -46,41 +57,24 @@ specialize qw/vp9_filter_by_weight8x8 sse2 msa/; # # dct # -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2/; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2/; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp9_iht16x16_256_add sse2/; - } -} else { - # Force C versions if CONFIG_EMULATE_HARDWARE is 1 - if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/; - - add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; - specialize qw/vp9_iht16x16_256_add sse2 dspr2 msa/; +# Force C versions if CONFIG_EMULATE_HARDWARE is 1 +add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; + +add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; + +add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; + +if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { + # Note that there are more specializations appended when + # CONFIG_VP9_HIGHBITDEPTH is off. + specialize qw/vp9_iht4x4_16_add sse2/; + specialize qw/vp9_iht8x8_64_add sse2/; + specialize qw/vp9_iht16x16_256_add sse2/; + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { + # Note that these specializations are appended to the above ones. 
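For readers unfamiliar with the rtcd perl above: add_proto declares one function prototype and specialize lists the SIMD flavors the generated dispatcher may bind at runtime, which is why this refactor can simply append extra specialize lines instead of duplicating whole highbitdepth branches. The generated header has roughly this shape (an illustrative sketch only; the real generated code differs in detail):

typedef short tran_low_t;  // int32_t in CONFIG_VP9_HIGHBITDEPTH builds

void vp9_iht4x4_16_add_c(const tran_low_t *input, unsigned char *dest,
                         int stride, int tx_type);
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, unsigned char *dest,
                            int stride, int tx_type);

// Dispatcher: defaults to C, upgraded once CPU features are probed.
void (*vp9_iht4x4_16_add)(const tran_low_t *, unsigned char *, int, int) =
    vp9_iht4x4_16_add_c;

static void setup_rtcd(int have_sse2) {
  if (have_sse2) vp9_iht4x4_16_add = vp9_iht4x4_16_add_sse2;
}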
+ specialize qw/vp9_iht4x4_16_add neon dspr2 msa/; + specialize qw/vp9_iht8x8_64_add neon dspr2 msa/; + specialize qw/vp9_iht16x16_256_add dspr2 msa/; } } @@ -124,82 +118,69 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { specialize qw/vp9_denoiser_filter neon sse2/; } -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - specialize qw/vp9_block_error avx2 sse2/; +add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/vp9_highbd_block_error sse2/; +add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; - add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; - specialize qw/vp9_block_error_fp sse2/; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64"; - add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; +add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + specialize qw/vp9_block_error avx2 sse2/; + + specialize qw/vp9_block_error_fp avx2 sse2/; - add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant neon ssse3/; + + add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; + specialize 
qw/vp9_highbd_block_error sse2/; } else { - add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; specialize qw/vp9_block_error avx2 msa sse2/; - add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size"; - specialize qw/vp9_block_error_fp neon sse2/; - - add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64"; + specialize qw/vp9_block_error_fp neon avx2 sse2/; - add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; - - add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_fdct8x8_quant sse2 ssse3 neon/; } # fdct functions -if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2/; - - add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2/; +add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht16x16 sse2/; +add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4 sse2/; -} else { - add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht4x4 sse2 msa/; +add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht8x8 sse2 msa/; +add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; - specialize qw/vp9_fht16x16 sse2 msa/; - - add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vp9_fwht4x4 msa sse2/; +# Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH +# is off. +specialize qw/vp9_fht4x4 sse2/; +specialize qw/vp9_fht8x8 sse2/; +specialize qw/vp9_fht16x16 sse2/; +specialize qw/vp9_fwht4x4 sse2/; +if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { + # Note that these specializations are appended to the above ones. 
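One reason the prototypes above can be shared between the two build flavors: the element type itself changes. tran_low_t is 16 bits in standard builds and 32 bits under CONFIG_VP9_HIGHBITDEPTH, since 10- and 12-bit input needs coefficient headroom beyond int16_t; this is also why the old non-highbitdepth vp9_block_error_fp prototype could use int16_t directly. Paraphrasing the vpx_dsp typedef:

#include <stdint.h>

#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;  // 10/12-bit pixels overflow 16-bit coefficients
#else
typedef int16_t tran_low_t;  // 8-bit pixels: coefficients fit in 16 bits
#endif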
+ specialize qw/vp9_fht4x4 msa/; + specialize qw/vp9_fht8x8 msa/; + specialize qw/vp9_fht16x16 msa/; + specialize qw/vp9_fwht4x4 msa/; } # # Motion search # -add_proto qw/int vp9_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv"; -specialize qw/vp9_full_search_sad sse3 sse4_1/; -$vp9_full_search_sad_sse3=vp9_full_search_sadx3; -$vp9_full_search_sad_sse4_1=vp9_full_search_sadx8; - add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; specialize qw/vp9_diamond_search_sad avx/; +if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void vp9_temporal_filter_apply/, "const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, uint32_t *accumulator, uint16_t *count"; specialize qw/vp9_temporal_filter_apply sse4_1/; +} if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { @@ -227,7 +208,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # frame based scale # add_proto qw/void vp9_scale_and_extend_frame/, "const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler"; -specialize qw/vp9_scale_and_extend_frame ssse3/; +specialize qw/vp9_scale_and_extend_frame neon ssse3/; } # end encoder functions diff --git a/libvpx/vp9/common/vp9_thread_common.c b/libvpx/vp9/common/vp9_thread_common.c index 07e659d23..8d44e91f2 100644 --- a/libvpx/vp9/common/vp9_thread_common.c +++ b/libvpx/vp9/common/vp9_thread_common.c @@ -140,8 +140,9 @@ static INLINE void thread_loop_filter_rows( } // Row-based multi-threaded loopfilter hook -static int loop_filter_row_worker(VP9LfSync *const lf_sync, - LFWorkerData *const lf_data) { +static int loop_filter_row_worker(void *arg1, void *arg2) { + VP9LfSync *const lf_sync = (VP9LfSync *)arg1; + LFWorkerData *const lf_data = (LFWorkerData *)arg2; thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); @@ -183,7 +184,7 @@ static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm, VPxWorker *const worker = &workers[i]; LFWorkerData *const lf_data = &lf_sync->lfdata[i]; - worker->hook = (VPxWorkerHook)loop_filter_row_worker; + worker->hook = loop_filter_row_worker; worker->data1 = lf_sync; worker->data2 = lf_data; diff --git a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c index bb2dcf52b..6996260e2 100644 --- a/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/libvpx/vp9/common/x86/vp9_idct_intrin_sse2.c @@ -18,8 +18,8 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, __m128i in[2]; const __m128i eight = _mm_set1_epi16(8); - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); switch (tx_type) { case 0: // DCT_DCT @@ -54,18 +54,17 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[8]; - const __m128i zero = _mm_setzero_si128(); const __m128i final_rounding = _mm_set1_epi16(1 
<< 4); // load input data - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 1); - in[2] = load_input_data(input + 8 * 2); - in[3] = load_input_data(input + 8 * 3); - in[4] = load_input_data(input + 8 * 4); - in[5] = load_input_data(input + 8 * 5); - in[6] = load_input_data(input + 8 * 6); - in[7] = load_input_data(input + 8 * 7); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8 * 1); + in[2] = load_input_data8(input + 8 * 2); + in[3] = load_input_data8(input + 8 * 3); + in[4] = load_input_data8(input + 8 * 4); + in[5] = load_input_data8(input + 8 * 5); + in[6] = load_input_data8(input + 8 * 6); + in[7] = load_input_data8(input + 8 * 7); switch (tx_type) { case 0: // DCT_DCT @@ -106,14 +105,91 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, in[6] = _mm_srai_epi16(in[6], 5); in[7] = _mm_srai_epi16(in[7], 5); - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); +} + +static INLINE void load_buffer_8x16(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); + + in[8] = load_input_data8(input + 8 * 16); + in[9] = load_input_data8(input + 9 * 16); + in[10] = load_input_data8(input + 10 * 16); + in[11] = load_input_data8(input + 11 * 16); + in[12] = load_input_data8(input + 12 * 16); + in[13] = load_input_data8(input + 13 * 16); + in[14] = load_input_data8(input + 14 * 16); + in[15] = load_input_data8(input + 15 * 16); +} + +static INLINE void write_buffer_8x16(uint8_t *const dest, __m128i *const in, + const int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + // Final rounding and shift + in[0] = _mm_adds_epi16(in[0], final_rounding); + in[1] = _mm_adds_epi16(in[1], final_rounding); + in[2] = _mm_adds_epi16(in[2], final_rounding); + in[3] = _mm_adds_epi16(in[3], final_rounding); + in[4] = _mm_adds_epi16(in[4], final_rounding); + in[5] = _mm_adds_epi16(in[5], final_rounding); + in[6] = _mm_adds_epi16(in[6], final_rounding); + in[7] = _mm_adds_epi16(in[7], final_rounding); + in[8] = _mm_adds_epi16(in[8], final_rounding); + in[9] = _mm_adds_epi16(in[9], final_rounding); + in[10] = _mm_adds_epi16(in[10], final_rounding); + in[11] = _mm_adds_epi16(in[11], final_rounding); + in[12] = _mm_adds_epi16(in[12], final_rounding); + in[13] = _mm_adds_epi16(in[13], final_rounding); + in[14] = _mm_adds_epi16(in[14], final_rounding); + in[15] = _mm_adds_epi16(in[15], final_rounding); + + in[0] = _mm_srai_epi16(in[0], 6); + in[1] = _mm_srai_epi16(in[1], 6); + in[2] = _mm_srai_epi16(in[2], 6); + in[3] = 
_mm_srai_epi16(in[3], 6); + in[4] = _mm_srai_epi16(in[4], 6); + in[5] = _mm_srai_epi16(in[5], 6); + in[6] = _mm_srai_epi16(in[6], 6); + in[7] = _mm_srai_epi16(in[7], 6); + in[8] = _mm_srai_epi16(in[8], 6); + in[9] = _mm_srai_epi16(in[9], 6); + in[10] = _mm_srai_epi16(in[10], 6); + in[11] = _mm_srai_epi16(in[11], 6); + in[12] = _mm_srai_epi16(in[12], 6); + in[13] = _mm_srai_epi16(in[13], 6); + in[14] = _mm_srai_epi16(in[14], 6); + in[15] = _mm_srai_epi16(in[15], 6); + + recon_and_store(dest + 0 * stride, in[0]); + recon_and_store(dest + 1 * stride, in[1]); + recon_and_store(dest + 2 * stride, in[2]); + recon_and_store(dest + 3 * stride, in[3]); + recon_and_store(dest + 4 * stride, in[4]); + recon_and_store(dest + 5 * stride, in[5]); + recon_and_store(dest + 6 * stride, in[6]); + recon_and_store(dest + 7 * stride, in[7]); + recon_and_store(dest + 8 * stride, in[8]); + recon_and_store(dest + 9 * stride, in[9]); + recon_and_store(dest + 10 * stride, in[10]); + recon_and_store(dest + 11 * stride, in[11]); + recon_and_store(dest + 12 * stride, in[12]); + recon_and_store(dest + 13 * stride, in[13]); + recon_and_store(dest + 14 * stride, in[14]); + recon_and_store(dest + 15 * stride, in[15]); } void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, diff --git a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm index 30852049b..ca0897ab9 100644 --- a/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm +++ b/libvpx/vp9/common/x86/vp9_mfqe_sse2.asm @@ -12,6 +12,8 @@ ; TODO(jackychen): Find a way to fix the duplicate. %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vp9_filter_by_weight16x16_sse2 ;( ; unsigned char *src, diff --git a/libvpx/vp9/decoder/vp9_decodeframe.c b/libvpx/vp9/decoder/vp9_decodeframe.c index 0760f8c23..d0e896c13 100644 --- a/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/libvpx/vp9/decoder/vp9_decodeframe.c @@ -490,8 +490,8 @@ static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride, #endif // CONFIG_VP9_HIGHBITDEPTH static void dec_build_inter_predictors( - VPxWorker *const worker, MACROBLOCKD *xd, int plane, int bw, int bh, int x, - int y, int w, int h, int mi_x, int mi_y, const InterpKernel *kernel, + MACROBLOCKD *xd, int plane, int bw, int bh, int x, int y, int w, int h, + int mi_x, int mi_y, const InterpKernel *kernel, const struct scale_factors *sf, struct buf_2d *pre_buf, struct buf_2d *dst_buf, const MV *mv, RefCntBuffer *ref_frame_buf, int is_scaled, int ref) { @@ -593,12 +593,6 @@ static void dec_build_inter_predictors( y_pad = 1; } - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 0 : 1)); - // Skip border extension if block is inside the frame. if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 || y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { @@ -617,14 +611,6 @@ static void dec_build_inter_predictors( w, h, ref, xs, ys); return; } - } else { - // Wait until reference block is ready. Pad 7 more pixels as last 7 - // pixels of each superblock row can be changed by next superblock row. - if (worker != NULL) { - const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS; - vp9_frameworker_wait(worker, ref_frame_buf, VPXMAX(0, (y1 + 7)) - << (plane == 0 ? 
0 : 1)); - } } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -653,8 +639,6 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int is_compound = has_second_ref(mi); int ref; int is_scaled; - VPxWorker *const fwo = - pbi->frame_parallel_decode ? pbi->frame_worker_owner : NULL; for (ref = 0; ref < 1 + is_compound; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; @@ -686,10 +670,10 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, for (y = 0; y < num_4x4_h; ++y) { for (x = 0; x < num_4x4_w; ++x) { const MV mv = average_split_mvs(pd, mi, ref, i++); - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 4 * x, - 4 * y, 4, 4, mi_x, mi_y, kernel, sf, - pre_buf, dst_buf, &mv, ref_frame_buf, - is_scaled, ref); + dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 4 * x, 4 * y, + 4, 4, mi_x, mi_y, kernel, sf, pre_buf, + dst_buf, &mv, ref_frame_buf, is_scaled, + ref); } } } @@ -703,7 +687,7 @@ static void dec_build_inter_predictors_sb(VP9Decoder *const pbi, const int n4w_x4 = 4 * num_4x4_w; const int n4h_x4 = 4 * num_4x4_h; struct buf_2d *const pre_buf = &pd->pre[ref]; - dec_build_inter_predictors(fwo, xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, + dec_build_inter_predictors(xd, plane, n4w_x4, n4h_x4, 0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel, sf, pre_buf, dst_buf, &mv, ref_frame_buf, is_scaled, ref); } @@ -1187,7 +1171,6 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1197,12 +1180,11 @@ static void setup_frame_size(VP9_COMMON *cm, struct vpx_read_bit_buffer *rb) { VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1273,7 +1255,6 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, resize_context_buffers(cm, width, height); setup_render_size(cm, rb); - lock_buffer_pool(pool); if (vpx_realloc_frame_buffer( get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, @@ -1283,12 +1264,11 @@ static void setup_frame_size_with_refs(VP9_COMMON *cm, VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment, &pool->frame_bufs[cm->new_fb_idx].raw_frame_buffer, pool->get_fb_cb, pool->cb_priv)) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } - unlock_buffer_pool(pool); + pool->frame_bufs[cm->new_fb_idx].released = 0; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_x = cm->subsampling_x; pool->frame_bufs[cm->new_fb_idx].buf.subsampling_y = cm->subsampling_y; pool->frame_bufs[cm->new_fb_idx].buf.bit_depth = (unsigned int)cm->bit_depth; @@ -1384,7 +1364,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, pbi->lf_worker.data1 == NULL) { CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); - pbi->lf_worker.hook = 
(VPxWorkerHook)vp9_loop_filter_worker; + pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Loop filter thread creation failed"); @@ -1473,11 +1453,6 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, winterface->execute(&pbi->lf_worker); } } - // After loopfiltering, the last 7 row pixels in each superblock row may - // still be changed by the longest loopfilter of the next superblock - // row. - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, mi_row << MI_BLOCK_SIZE_LOG2); } } @@ -1493,16 +1468,16 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, // Get last tile data. tile_data = pbi->tile_worker_data + tile_cols * tile_rows - 1; - if (pbi->frame_parallel_decode) - vp9_frameworker_broadcast(pbi->cur_buf, INT_MAX); return vpx_reader_find_end(&tile_data->bit_reader); } // On entry 'tile_data->data_end' points to the end of the input frame, on exit // it is updated to reflect the bitreader position of the final tile column if // present in the tile buffer group or NULL otherwise. -static int tile_worker_hook(TileWorkerData *const tile_data, - VP9Decoder *const pbi) { +static int tile_worker_hook(void *arg1, void *arg2) { + TileWorkerData *const tile_data = (TileWorkerData *)arg1; + VP9Decoder *const pbi = (VP9Decoder *)arg2; + TileInfo *volatile tile = &tile_data->xd.tile; const int final_col = (1 << pbi->common.log2_tile_cols) - 1; const uint8_t *volatile bit_reader_end = NULL; @@ -1596,7 +1571,7 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data, tile_data->xd = pbi->mb; tile_data->xd.counts = cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts; - worker->hook = (VPxWorkerHook)tile_worker_hook; + worker->hook = tile_worker_hook; worker->data1 = tile_data; worker->data2 = pbi; } @@ -1779,24 +1754,17 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->show_existing_frame) { // Show an existing frame directly. const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)]; - lock_buffer_pool(pool); if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) { - unlock_buffer_pool(pool); vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, "Buffer %d does not contain a decoded frame", frame_to_show); } ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show); - unlock_buffer_pool(pool); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; cm->show_frame = 1; - if (pbi->frame_parallel_decode) { - for (i = 0; i < REF_FRAMES; ++i) - cm->next_ref_frame_map[i] = cm->ref_frame_map[i]; - } return 0; } @@ -1913,7 +1881,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, cm->frame_context_idx = vpx_rb_read_literal(rb, FRAME_CONTEXTS_LOG2); // Generate next_ref_frame_map. - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { if (mask & 1) { cm->next_ref_frame_map[ref_index] = cm->new_fb_idx; @@ -1933,7 +1900,6 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, if (cm->ref_frame_map[ref_index] >= 0) ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 1; if (frame_is_intra_only(cm) || cm->error_resilient_mode) @@ -2090,24 +2056,6 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, vp9_loop_filter_frame_init(cm, cm->lf.filter_level); } - // If encoded in frame parallel mode, frame context is ready after decoding - // the frame header. 
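ref_cnt_fb(), used in the show_existing_frame path above, retargets a frame index while keeping reference counts balanced: drop one reference on the buffer the index used to name and take one on the new buffer. Roughly (a simplified sketch of the helper in vp9_onyxc_int.h):

static void ref_cnt_fb_sketch(int *ref_count, int *idx, int new_idx) {
  const int old_idx = *idx;
  if (old_idx >= 0 && ref_count[old_idx] > 0) --ref_count[old_idx];
  *idx = new_idx;        // the index now names the new buffer...
  ++ref_count[new_idx];  // ...which gains one reference
}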
- if (pbi->frame_parallel_decode && cm->frame_parallel_decoding_mode) { - VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - if (cm->refresh_frame_context) { - context_updated = 1; - cm->frame_contexts[cm->frame_context_idx] = *cm->fc; - } - vp9_frameworker_lock_stats(worker); - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - frame_worker_data->frame_context_ready = 1; - // Signal the main thread that context is ready. - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } - if (pbi->tile_worker_data == NULL || (tile_cols * tile_rows) != pbi->total_tiles) { const int num_tile_workers = diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c index 1a4152436..0a781413b 100644 --- a/libvpx/vp9/decoder/vp9_decodemv.c +++ b/libvpx/vp9/decoder/vp9_decodemv.c @@ -455,12 +455,6 @@ static void dec_find_best_ref_mvs(int allow_hp, int_mv *mvlist, int_mv *best_mv, } } -static void fpm_sync(void *const data, int mi_row) { - VP9Decoder *const pbi = (VP9Decoder *)data; - vp9_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame, - mi_row << MI_BLOCK_SIZE_LOG2); -} - // This macro is used to add a motion vector mv_ref list if it isn't // already in the list. If it's the second motion vector or early_break // it will also skip all additional processing and jump to Done! @@ -500,8 +494,7 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, PREDICTION_MODE mode, MV_REFERENCE_FRAME ref_frame, const POSITION *const mv_ref_search, int_mv *mv_ref_list, int mi_row, int mi_col, - int block, int is_sub8x8, find_mv_refs_sync sync, - void *const data) { + int block, int is_sub8x8) { const int *ref_sign_bias = cm->ref_frame_sign_bias; int i, refmv_count = 0; int different_ref_found = 0; @@ -557,23 +550,8 @@ static int dec_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -// TODO(hkuang): Remove this sync after fixing pthread_cond_broadcast -// on windows platform. The sync here is unnecessary if use_prev_frame_mvs -// is 0. But after removing it, there will be hang in the unit test on windows -// due to several threads waiting for a thread's signal. -#if defined(_WIN32) && !HAVE_PTHREAD_H - if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } -#endif - // Check the last frame's mode and mv info. if (prev_frame_mvs) { - // Synchronize here for frame parallel decode if sync function is provided. 
- if (cm->frame_parallel_decode && sync != NULL) { - sync(data, mi_row); - } - if (prev_frame_mvs->ref_frame[0] == ref_frame) { ADD_MV_REF_LIST_EB(prev_frame_mvs->mv[0], refmv_count, mv_ref_list, Done); } else if (prev_frame_mvs->ref_frame[1] == ref_frame) { @@ -652,7 +630,7 @@ static void append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd, refmv_count = dec_find_mv_refs(cm, xd, b_mode, mi->ref_frame[ref], mv_ref_search, - mv_list, mi_row, mi_col, block, 1, NULL, NULL); + mv_list, mi_row, mi_col, block, 1); switch (block) { case 0: best_sub8x8->as_int = mv_list[refmv_count - 1].as_int; break; @@ -750,9 +728,8 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; int refmv_count; - refmv_count = - dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, tmp_mvs, - mi_row, mi_col, -1, 0, fpm_sync, (void *)pbi); + refmv_count = dec_find_mv_refs(cm, xd, mi->mode, frame, mv_ref_search, + tmp_mvs, mi_row, mi_col, -1, 0); dec_find_best_ref_mvs(allow_hp, tmp_mvs, &best_ref_mvs[ref], refmv_count); diff --git a/libvpx/vp9/decoder/vp9_decoder.c b/libvpx/vp9/decoder/vp9_decoder.c index 37693f094..a913fa560 100644 --- a/libvpx/vp9/decoder/vp9_decoder.c +++ b/libvpx/vp9/decoder/vp9_decoder.c @@ -139,6 +139,7 @@ void vp9_decoder_remove(VP9Decoder *pbi) { vp9_loop_filter_dealloc(&pbi->lf_row_sync); } + vp9_remove_common(&pbi->common); vpx_free(pbi); } @@ -169,7 +170,7 @@ vpx_codec_err_t vp9_copy_reference_dec(VP9Decoder *pbi, vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Incorrect buffer dimensions"); else - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); } else { vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Invalid reference frame"); } @@ -217,7 +218,7 @@ vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm, "Incorrect buffer dimensions"); } else { // Overwrite the reference frame buffer. - vp8_yv12_copy_frame(sd, ref_buf); + vpx_yv12_copy_frame(sd, ref_buf); } return cm->error.error_code; @@ -230,7 +231,6 @@ static void swap_frame_buffers(VP9Decoder *pbi) { BufferPool *const pool = cm->buffer_pool; RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - lock_buffer_pool(pool); for (mask = pbi->refresh_frame_flags; mask; mask >>= 1) { const int old_idx = cm->ref_frame_map[ref_index]; // Current thread releases the holding of reference frame. @@ -250,15 +250,10 @@ static void swap_frame_buffers(VP9Decoder *pbi) { decrease_ref_count(old_idx, frame_bufs, pool); cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index]; } - unlock_buffer_pool(pool); pbi->hold_ref_buf = 0; cm->frame_to_show = get_frame_new_buffer(cm); - if (!pbi->frame_parallel_decode || !cm->show_frame) { - lock_buffer_pool(pool); - --frame_bufs[cm->new_fb_idx].ref_count; - unlock_buffer_pool(pool); - } + --frame_bufs[cm->new_fb_idx].ref_count; // Invalidate these references until the next frame starts. for (ref_index = 0; ref_index < 3; ref_index++) @@ -292,11 +287,13 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, pbi->ready_for_new_data = 0; // Check if the previous frame was a frame without any references to it. - // Release frame buffer if not decoding in frame parallel mode. 
- if (!pbi->frame_parallel_decode && cm->new_fb_idx >= 0 && - frame_bufs[cm->new_fb_idx].ref_count == 0) + if (cm->new_fb_idx >= 0 && frame_bufs[cm->new_fb_idx].ref_count == 0 && + !frame_bufs[cm->new_fb_idx].released) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[cm->new_fb_idx].raw_frame_buffer); + frame_bufs[cm->new_fb_idx].released = 1; + } + // Find a free frame buffer. Return error if can not find any. cm->new_fb_idx = get_free_fb(cm); if (cm->new_fb_idx == INVALID_IDX) { @@ -309,18 +306,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx]; pbi->hold_ref_buf = 0; - if (pbi->frame_parallel_decode) { - VPxWorker *const worker = pbi->frame_worker_owner; - vp9_frameworker_lock_stats(worker); - frame_bufs[cm->new_fb_idx].frame_worker_owner = worker; - // Reset decoding progress. - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - pbi->cur_buf->row = -1; - pbi->cur_buf->col = -1; - vp9_frameworker_unlock_stats(worker); - } else { - pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; - } + pbi->cur_buf = &frame_bufs[cm->new_fb_idx]; if (setjmp(cm->error.jmp)) { const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); @@ -336,7 +322,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, winterface->sync(&pbi->tile_workers[i]); } - lock_buffer_pool(pool); // Release all the reference buffers if worker thread is holding them. if (pbi->hold_ref_buf == 1) { int ref_index = 0, mask; @@ -361,7 +346,6 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, } // Release current frame. decrease_ref_count(cm->new_fb_idx, frame_bufs, pool); - unlock_buffer_pool(pool); vpx_clear_system_state(); return -1; @@ -377,31 +361,14 @@ int vp9_receive_compressed_data(VP9Decoder *pbi, size_t size, if (!cm->show_existing_frame) { cm->last_show_frame = cm->show_frame; cm->prev_frame = cm->cur_frame; - if (cm->seg.enabled && !pbi->frame_parallel_decode) - vp9_swap_current_and_last_seg_map(cm); + if (cm->seg.enabled) vp9_swap_current_and_last_seg_map(cm); } // Update progress in frame parallel decode. - if (pbi->frame_parallel_decode) { - // Need to lock the mutex here as another thread may - // be accessing this buffer. - VPxWorker *const worker = pbi->frame_worker_owner; - FrameWorkerData *const frame_worker_data = worker->data1; - vp9_frameworker_lock_stats(worker); - - if (cm->show_frame) { - cm->current_video_frame++; - } - frame_worker_data->frame_decoded = 1; - frame_worker_data->frame_context_ready = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - } else { - cm->last_width = cm->width; - cm->last_height = cm->height; - if (cm->show_frame) { - cm->current_video_frame++; - } + cm->last_width = cm->width; + cm->last_height = cm->height; + if (cm->show_frame) { + cm->current_video_frame++; } cm->error.setjmp = 0; diff --git a/libvpx/vp9/decoder/vp9_decoder.h b/libvpx/vp9/decoder/vp9_decoder.h index 427baf1e0..4b26c314d 100644 --- a/libvpx/vp9/decoder/vp9_decoder.h +++ b/libvpx/vp9/decoder/vp9_decoder.h @@ -21,7 +21,6 @@ #include "vp9/common/vp9_thread_common.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" -#include "vp9/decoder/vp9_dthread.h" #ifdef __cplusplus extern "C" { @@ -53,13 +52,10 @@ typedef struct VP9Decoder { int refresh_frame_flags; - int frame_parallel_decode; // frame-based threading. - // TODO(hkuang): Combine this with cur_buf in macroblockd as they are // the same. RefCntBuffer *cur_buf; // Current decoding frame buffer. 
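The `if (setjmp(cm->error.jmp))` block above is the decoder's error trampoline: vpx_internal_error() longjmps back to it on a fatal parse error so held buffers can be released before returning failure. The minimal shape of the pattern:

#include <setjmp.h>

static jmp_buf error_jmp;

static void fatal_error(void) { longjmp(error_jmp, 1); }

static int decode_one_frame(void) {
  if (setjmp(error_jmp)) {
    // Landed here via fatal_error(): release held references, then bail.
    return -1;
  }
  // ... parse and decode; any fatal problem calls fatal_error() ...
  return 0;
}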
- VPxWorker *frame_worker_owner; // frame_worker that owns this pbi. VPxWorker lf_worker; VPxWorker *tile_workers; TileWorkerData *tile_worker_data; @@ -121,9 +117,10 @@ static INLINE void decrease_ref_count(int idx, RefCntBuffer *const frame_bufs, // But the private buffer is not set up until finish decoding header. // So any error happens during decoding header, the frame_bufs will not // have valid priv buffer. - if (frame_bufs[idx].ref_count == 0 && + if (!frame_bufs[idx].released && frame_bufs[idx].ref_count == 0 && frame_bufs[idx].raw_frame_buffer.priv) { pool->release_fb_cb(pool->cb_priv, &frame_bufs[idx].raw_frame_buffer); + frame_bufs[idx].released = 1; } } } diff --git a/libvpx/vp9/decoder/vp9_dthread.c b/libvpx/vp9/decoder/vp9_dthread.c deleted file mode 100644 index 52bc2a0f6..000000000 --- a/libvpx/vp9/decoder/vp9_dthread.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "vpx_mem/vpx_mem.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_decoder.h" - -// #define DEBUG_THREAD - -// TODO(hkuang): Clean up all the #ifdef in this file. -void vp9_frameworker_lock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_lock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_unlock_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - pthread_mutex_unlock(&worker_data->stats_mutex); -#else - (void)worker; -#endif -} - -void vp9_frameworker_signal_stats(VPxWorker *const worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const worker_data = worker->data1; - -// TODO(hkuang): Fix the pthread_cond_broadcast in windows wrapper. -#if defined(_WIN32) && !HAVE_PTHREAD_H - pthread_cond_signal(&worker_data->stats_cond); -#else - pthread_cond_broadcast(&worker_data->stats_cond); -#endif - -#else - (void)worker; -#endif -} - -// This macro prevents thread_sanitizer from reporting known concurrent writes. -#if defined(__has_feature) -#if __has_feature(thread_sanitizer) -#define BUILDING_WITH_TSAN -#endif -#endif - -// TODO(hkuang): Remove worker parameter as it is only used in debug code. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row) { -#if CONFIG_MULTITHREAD - if (!ref_buf) return; - -#ifndef BUILDING_WITH_TSAN - // The following line of code will get harmless tsan error but it is the key - // to get best performance. - if (ref_buf->row >= row && ref_buf->buf.corrupted != 1) return; -#endif - - { - // Find the worker thread that owns the reference frame. If the reference - // frame has been fully decoded, it may not have owner. 
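The deleted vp9_frameworker_wait() above is an instance of the standard condition-variable idiom: the predicate is re-checked in a loop because pthread_cond_wait() may wake spuriously. Stripped to its core, with Progress standing in for the per-worker stats state:

#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int row;  // last fully decoded row, published by the owning worker
} Progress;

static void wait_for_row(Progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  while (p->row < row) pthread_cond_wait(&p->cond, &p->mutex);
  pthread_mutex_unlock(&p->mutex);
}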
- VPxWorker *const ref_worker = ref_buf->frame_worker_owner; - FrameWorkerData *const ref_worker_data = - (FrameWorkerData *)ref_worker->data1; - const VP9Decoder *const pbi = ref_worker_data->pbi; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker is waiting for %d %p worker (%d) ref %d \r\n", - worker_data->worker_id, worker, ref_worker_data->worker_id, - ref_buf->frame_worker_owner, row, ref_buf->row); - } -#endif - - vp9_frameworker_lock_stats(ref_worker); - while (ref_buf->row < row && pbi->cur_buf == ref_buf && - ref_buf->buf.corrupted != 1) { - pthread_cond_wait(&ref_worker_data->stats_cond, - &ref_worker_data->stats_mutex); - } - - if (ref_buf->buf.corrupted == 1) { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - vp9_frameworker_unlock_stats(ref_worker); - vpx_internal_error(&worker_data->pbi->common.error, - VPX_CODEC_CORRUPT_FRAME, - "Worker %p failed to decode frame", worker); - } - vp9_frameworker_unlock_stats(ref_worker); - } -#else - (void)worker; - (void)ref_buf; - (void)row; - (void)ref_buf; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row) { -#if CONFIG_MULTITHREAD - VPxWorker *worker = buf->frame_worker_owner; - -#ifdef DEBUG_THREAD - { - FrameWorkerData *const worker_data = (FrameWorkerData *)worker->data1; - printf("%d %p worker decode to (%d) \r\n", worker_data->worker_id, - buf->frame_worker_owner, row); - } -#endif - - vp9_frameworker_lock_stats(worker); - buf->row = row; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); -#else - (void)buf; - (void)row; -#endif // CONFIG_MULTITHREAD -} - -void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker) { -#if CONFIG_MULTITHREAD - FrameWorkerData *const src_worker_data = (FrameWorkerData *)src_worker->data1; - FrameWorkerData *const dst_worker_data = (FrameWorkerData *)dst_worker->data1; - VP9_COMMON *const src_cm = &src_worker_data->pbi->common; - VP9_COMMON *const dst_cm = &dst_worker_data->pbi->common; - int i; - - // Wait until source frame's context is ready. - vp9_frameworker_lock_stats(src_worker); - while (!src_worker_data->frame_context_ready) { - pthread_cond_wait(&src_worker_data->stats_cond, - &src_worker_data->stats_mutex); - } - - dst_cm->last_frame_seg_map = src_cm->seg.enabled - ? src_cm->current_frame_seg_map - : src_cm->last_frame_seg_map; - dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync; - vp9_frameworker_unlock_stats(src_worker); - - dst_cm->bit_depth = src_cm->bit_depth; -#if CONFIG_VP9_HIGHBITDEPTH - dst_cm->use_highbitdepth = src_cm->use_highbitdepth; -#endif - dst_cm->prev_frame = - src_cm->show_existing_frame ? src_cm->prev_frame : src_cm->cur_frame; - dst_cm->last_width = - !src_cm->show_existing_frame ? src_cm->width : src_cm->last_width; - dst_cm->last_height = - !src_cm->show_existing_frame ? src_cm->height : src_cm->last_height; - dst_cm->subsampling_x = src_cm->subsampling_x; - dst_cm->subsampling_y = src_cm->subsampling_y; - dst_cm->frame_type = src_cm->frame_type; - dst_cm->last_show_frame = !src_cm->show_existing_frame - ? 
src_cm->show_frame - : src_cm->last_show_frame; - for (i = 0; i < REF_FRAMES; ++i) - dst_cm->ref_frame_map[i] = src_cm->next_ref_frame_map[i]; - - memcpy(dst_cm->lf_info.lfthr, src_cm->lf_info.lfthr, - (MAX_LOOP_FILTER + 1) * sizeof(loop_filter_thresh)); - dst_cm->lf.last_sharpness_level = src_cm->lf.sharpness_level; - dst_cm->lf.filter_level = src_cm->lf.filter_level; - memcpy(dst_cm->lf.ref_deltas, src_cm->lf.ref_deltas, MAX_REF_LF_DELTAS); - memcpy(dst_cm->lf.mode_deltas, src_cm->lf.mode_deltas, MAX_MODE_LF_DELTAS); - dst_cm->seg = src_cm->seg; - memcpy(dst_cm->frame_contexts, src_cm->frame_contexts, - FRAME_CONTEXTS * sizeof(dst_cm->frame_contexts[0])); -#else - (void)dst_worker; - (void)src_worker; -#endif // CONFIG_MULTITHREAD -} diff --git a/libvpx/vp9/decoder/vp9_dthread.h b/libvpx/vp9/decoder/vp9_dthread.h deleted file mode 100644 index fce0fe7fe..000000000 --- a/libvpx/vp9/decoder/vp9_dthread.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_DECODER_VP9_DTHREAD_H_ -#define VP9_DECODER_VP9_DTHREAD_H_ - -#include "./vpx_config.h" -#include "vpx_util/vpx_thread.h" -#include "vpx/internal/vpx_codec_internal.h" - -#ifdef __cplusplus -extern "C" { -#endif - -struct VP9Common; -struct VP9Decoder; - -// WorkerData for the FrameWorker thread. It contains all the information of -// the worker and decode structures for decoding a frame. -typedef struct FrameWorkerData { - struct VP9Decoder *pbi; - const uint8_t *data; - const uint8_t *data_end; - size_t data_size; - void *user_priv; - int result; - int worker_id; - int received_frame; - - // scratch_buffer is used in frame parallel mode only. - // It is used to make a copy of the compressed data. - uint8_t *scratch_buffer; - size_t scratch_buffer_size; - -#if CONFIG_MULTITHREAD - pthread_mutex_t stats_mutex; - pthread_cond_t stats_cond; -#endif - - int frame_context_ready; // Current frame's context is ready to read. - int frame_decoded; // Finished decoding current frame. -} FrameWorkerData; - -void vp9_frameworker_lock_stats(VPxWorker *const worker); -void vp9_frameworker_unlock_stats(VPxWorker *const worker); -void vp9_frameworker_signal_stats(VPxWorker *const worker); - -// Wait until ref_buf has been decoded to row in real pixel unit. -// Note: worker may already finish decoding ref_buf and release it in order to -// start decoding next frame. So need to check whether worker is still decoding -// ref_buf. -void vp9_frameworker_wait(VPxWorker *const worker, RefCntBuffer *const ref_buf, - int row); - -// FrameWorker broadcasts its decoding progress so other workers that are -// waiting on it can resume decoding. -void vp9_frameworker_broadcast(RefCntBuffer *const buf, int row); - -// Copy necessary decoding context from src worker to dst worker. 
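vp9_frameworker_broadcast(), whose prototype is deleted above, was the publishing half of the same idiom: update the shared progress under the mutex, then wake every waiter. Reusing the Progress stand-in from the wait sketch earlier:

static void publish_row(Progress *p, int row) {
  pthread_mutex_lock(&p->mutex);
  p->row = row;  // store under the mutex so no waiter misses the update
  pthread_cond_broadcast(&p->cond);
  pthread_mutex_unlock(&p->mutex);
}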
-void vp9_frameworker_copy_context(VPxWorker *const dst_worker, - VPxWorker *const src_worker); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP9_DECODER_VP9_DTHREAD_H_ diff --git a/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c new file mode 100644 index 000000000..e46f789ba --- /dev/null +++ b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c @@ -0,0 +1,843 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" +#include "./vpx_scale_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_scale/yv12config.h" + +// Note: The scaling functions could write extra rows and columns in dst, which +// exceed the right and bottom boundaries of the destination frame. We rely on +// the following frame extension function to fix these rows and columns. + +static INLINE void scale_plane_2_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x2_t s = vld2q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_phase_0(const uint8_t *src, + const int src_stride, + uint8_t *dst, + const int dst_stride, const int w, + const int h) { + const int max_width = (w + 15) & ~15; + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + const uint8x16x4_t s = vld4q_u8(src); + vst1q_u8(dst, s.val[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_bilinear_kernel( + const uint8x16_t in0, const uint8x16_t in1, const uint8x16_t in2, + const uint8x16_t in3, const uint8x8_t coef0, const uint8x8_t coef1, + uint8_t *const dst) { + const uint16x8_t h0 = vmull_u8(vget_low_u8(in0), coef0); + const uint16x8_t h1 = vmull_u8(vget_high_u8(in0), coef0); + const uint16x8_t h2 = vmull_u8(vget_low_u8(in2), coef0); + const uint16x8_t h3 = vmull_u8(vget_high_u8(in2), coef0); + const uint16x8_t h4 = vmlal_u8(h0, vget_low_u8(in1), coef1); + const uint16x8_t h5 = vmlal_u8(h1, vget_high_u8(in1), coef1); + const uint16x8_t h6 = vmlal_u8(h2, vget_low_u8(in3), coef1); + const uint16x8_t h7 = vmlal_u8(h3, vget_high_u8(in3), coef1); + + const uint8x8_t hor0 = vrshrn_n_u16(h4, 7); // temp: 00 01 02 03 04 05 06 07 + const uint8x8_t hor1 = vrshrn_n_u16(h5, 7); // temp: 08 09 0A 0B 0C 0D 0E 0F + const uint8x8_t hor2 = vrshrn_n_u16(h6, 7); // temp: 10 11 12 13 14 15 16 17 + const uint8x8_t hor3 = vrshrn_n_u16(h7, 7); // temp: 18 19 1A 1B 1C 1D 1E 1F + const uint16x8_t v0 = vmull_u8(hor0, coef0); + const uint16x8_t v1 = vmull_u8(hor1, coef0); + const uint16x8_t v2 = vmlal_u8(v0, hor2, coef1); + const uint16x8_t v3 = 
vmlal_u8(v1, hor3, coef1); + // dst: 0 1 2 3 4 5 6 7 8 9 A B C D E F + const uint8x16_t d = vcombine_u8(vrshrn_n_u16(v2, 7), vrshrn_n_u16(v3, 7)); + vst1q_u8(dst, d); +} + +static INLINE void scale_plane_2_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // 000 002 004 006 008 00A 00C 00E 010 012 014 016 018 01A 01C 01E + // 001 003 005 007 009 00B 00D 00F 011 013 015 017 019 01B 01D 01F + const uint8x16x2_t s0 = vld2q_u8(src0); + // 100 102 104 106 108 10A 10C 10E 110 112 114 116 118 11A 11C 11E + // 101 103 105 107 109 10B 10D 10F 111 113 115 117 119 11B 11D 11F + const uint8x16x2_t s1 = vld2q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 32; + src1 += 32; + dst += 16; + x -= 16; + } while (x); + src0 += 2 * (src_stride - max_width); + src1 += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE void scale_plane_4_to_1_bilinear( + const uint8_t *const src, const int src_stride, uint8_t *dst, + const int dst_stride, const int w, const int h, const int16_t c0, + const int16_t c1) { + const int max_width = (w + 15) & ~15; + const uint8_t *src0 = src; + const uint8_t *src1 = src + src_stride; + const uint8x8_t coef0 = vdup_n_u8(c0); + const uint8x8_t coef1 = vdup_n_u8(c1); + int y = h; + + assert(w && h); + + do { + int x = max_width; + do { + // (*) -- useless + // 000 004 008 00C 010 014 018 01C 020 024 028 02C 030 034 038 03C + // 001 005 009 00D 011 015 019 01D 021 025 029 02D 031 035 039 03D + // 002 006 00A 00E 012 016 01A 01E 022 026 02A 02E 032 036 03A 03E (*) + // 003 007 00B 00F 013 017 01B 01F 023 027 02B 02F 033 037 03B 03F (*) + const uint8x16x4_t s0 = vld4q_u8(src0); + // 100 104 108 10C 110 114 118 11C 120 124 128 12C 130 134 138 13C + // 101 105 109 10D 111 115 119 11D 121 125 129 12D 131 135 139 13D + // 102 106 10A 10E 112 116 11A 11E 122 126 12A 12E 132 136 13A 13E (*) + // 103 107 10B 10F 113 117 11B 11F 123 127 12B 12F 133 137 13B 13F (*) + const uint8x16x4_t s1 = vld4q_u8(src1); + scale_plane_bilinear_kernel(s0.val[0], s0.val[1], s1.val[0], s1.val[1], + coef0, coef1, dst); + src0 += 64; + src1 += 64; + dst += 16; + x -= 16; + } while (x); + src0 += 4 * (src_stride - max_width); + src1 += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static INLINE uint8x8_t scale_filter_bilinear(const uint8x8_t *const s, + const uint8x8_t *const coef) { + const uint16x8_t h0 = vmull_u8(s[0], coef[0]); + const uint16x8_t h1 = vmlal_u8(h0, s[1], coef[1]); + + return vrshrn_n_u16(h1, 7); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[14], d[4]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + 
SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + // Note: processing 4x8 is about 20% faster than processing row by row using + // vld4_u8(). + do { + load_u8_8x8(src + 2, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + transpose_u8_8x8(&s[6], &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[2], filters); // 01 11 21 31 41 51 61 71 + d[2] = scale_filter_8(&s[4], filters); // 02 12 22 32 42 52 62 72 + d[3] = scale_filter_8(&s[6], filters); // 03 13 23 33 43 53 63 73 + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + transpose_u8_8x4(&d[0], &d[1], &d[2], &d[3]); + vst1_lane_u32((uint32_t *)(t + 0 * width_hor), vreinterpret_u32_u8(d[0]), + 0); + vst1_lane_u32((uint32_t *)(t + 1 * width_hor), vreinterpret_u32_u8(d[1]), + 0); + vst1_lane_u32((uint32_t *)(t + 2 * width_hor), vreinterpret_u32_u8(d[2]), + 0); + vst1_lane_u32((uint32_t *)(t + 3 * width_hor), vreinterpret_u32_u8(d[3]), + 0); + vst1_lane_u32((uint32_t *)(t + 4 * width_hor), vreinterpret_u32_u8(d[0]), + 1); + vst1_lane_u32((uint32_t *)(t + 5 * width_hor), vreinterpret_u32_u8(d[1]), + 1); + vst1_lane_u32((uint32_t *)(t + 6 * width_hor), vreinterpret_u32_u8(d[2]), + 1); + vst1_lane_u32((uint32_t *)(t + 7 * width_hor), vreinterpret_u32_u8(d[3]), + 1); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + t += 4; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, width_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 6 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[6], &s[7], &s[8], &s[9], &s[10], &s[11], + &s[12], &s[13]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[2], filters); // 10 11 12 13 14 15 16 17 + d[2] = scale_filter_8(&s[4], filters); // 20 21 22 23 24 25 26 27 + d[3] = scale_filter_8(&s[6], filters); // 30 31 32 33 34 35 36 37 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + const int16x8_t filters = vld1q_s16(coef); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[12], d[2]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + // Note: processing 2x8 is about 20% faster than processing row by row 
using + // vld4_u8(). + do { + load_u8_8x8(src + 4, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_4x8(&s[0], &s[1], &s[2], &s[3], s[4], s[5], s[6], s[7]); + x = width_hor; + + do { + uint8x8x2_t dd; + src += 8; + load_u8_8x8(src, src_stride, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + transpose_u8_8x8(&s[4], &s[5], &s[6], &s[7], &s[8], &s[9], &s[10], + &s[11]); + + d[0] = scale_filter_8(&s[0], filters); // 00 10 20 30 40 50 60 70 + d[1] = scale_filter_8(&s[4], filters); // 01 11 21 31 41 51 61 71 + // dd.val[0]: 00 01 20 21 40 41 60 61 + // dd.val[1]: 10 11 30 31 50 51 70 71 + dd = vtrn_u8(d[0], d[1]); + vst1_lane_u16((uint16_t *)(t + 0 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 0); + vst1_lane_u16((uint16_t *)(t + 1 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 0); + vst1_lane_u16((uint16_t *)(t + 2 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 1); + vst1_lane_u16((uint16_t *)(t + 3 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 1); + vst1_lane_u16((uint16_t *)(t + 4 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 2); + vst1_lane_u16((uint16_t *)(t + 5 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 2); + vst1_lane_u16((uint16_t *)(t + 6 * width_hor), + vreinterpret_u16_u8(dd.val[0]), 3); + vst1_lane_u16((uint16_t *)(t + 7 * width_hor), + vreinterpret_u16_u8(dd.val[1]), 3); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + t += 2; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 7 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x4(t, width_hor, &s[0], &s[1], &s[2], &s[3]); + t += 4 * width_hor; + y = height_ver; + + do { + load_u8_8x8(t, width_hor, &s[4], &s[5], &s[6], &s[7], &s[8], &s[9], + &s[10], &s[11]); + t += 8 * width_hor; + + d[0] = scale_filter_8(&s[0], filters); // 00 01 02 03 04 05 06 07 + d[1] = scale_filter_8(&s[4], filters); // 10 11 12 13 14 15 16 17 + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +// Notes for 4 to 3 scaling: +// +// 1. 6 rows are calculated in each horizontal inner loop, so width_hor must be +// multiple of 6, and no less than w. +// +// 2. 8 rows are calculated in each vertical inner loop, so width_ver must be +// multiple of 8, and no less than w. +// +// 3. 8 columns are calculated in each horizontal inner loop for further +// vertical scaling, so height_hor must be multiple of 8, and no less than +// 4 * h / 3. +// +// 4. 6 columns are calculated in each vertical inner loop, so height_ver must +// be multiple of 6, and no less than h. +// +// 5. The physical location of the last row of the 4 to 3 scaled frame is +// decided by phase_scaler, and are always less than 1 pixel below the last row +// of the original image. + +static void scale_plane_4_to_3_bilinear(const uint8_t *src, + const int src_stride, uint8_t *dst, + const int dst_stride, const int w, + const int h, const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We only need 1 extra row below because there are only 2 bilinear + // coefficients. 
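/*
 * [Editor's aside] A standalone illustration of the Q4 stepping used by the
 * 4-to-3 paths here: step_q4 = 16 * 4 / 3 = 21 sixteenths of a source pixel
 * per output pixel, so each group of 3 outputs advances the source by
 * exactly 4 pixels and only three filter phases ever occur. Demo only;
 * constants mirror the file:
 */
#include <stdio.h>

#define SUBPEL_MASK 15

int main(void) {
  const int step_q4 = 16 * 4 / 3; /* == 21 */
  const int phase_scaler = 8;     /* any value in [0, 16) */
  int group, k;
  for (group = 0; group < 2; ++group) {
    for (k = 0; k < 3; ++k) {
      const int pos = phase_scaler + k * step_q4;
      printf("out %d: src pixel %d, filter phase %d\n", 3 * group + k,
             4 * group + (pos >> 4), pos & SUBPEL_MASK);
    }
  }
  return 0;
}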
+ const int height_hor = (4 * h / 3 + 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[9], d[8], c[6]; + + assert(w && h); + + c[0] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][3]); + c[1] = vdup_n_u8((uint8_t)vp9_filter_kernels[BILINEAR][phase_scaler][4]); + c[2] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][3]); + c[3] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 1 * step_q4) & + SUBPEL_MASK][4]); + c[4] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][3]); + c[5] = vdup_n_u8( + (uint8_t)vp9_filter_kernels[BILINEAR][(phase_scaler + 2 * step_q4) & + SUBPEL_MASK][4]); + + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + src += 1; + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + load_u8_8x8(src, src_stride, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + src += 8; + transpose_u8_8x8(&s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7], &s[8]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3 - 1; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7], &s[8]); + t += 8 * stride_hor; + + d[0] = scale_filter_bilinear(&s[0], &c[0]); + d[1] = + scale_filter_bilinear(&s[(phase_scaler + 1 * step_q4) >> 4], &c[2]); + d[2] = + scale_filter_bilinear(&s[(phase_scaler + 2 * step_q4) >> 4], &c[4]); + d[3] = scale_filter_bilinear(&s[4], &c[0]); + d[4] = scale_filter_bilinear(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], + &c[2]); + d[5] = scale_filter_bilinear(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], + &c[4]); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 
4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 1); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = width_hor + 2; // store 2 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + const int16x8_t filters0 = + vld1q_s16(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters1 = + vld1q_s16(coef[(phase_scaler + 1 * step_q4) & SUBPEL_MASK]); + const int16x8_t filters2 = + vld1q_s16(coef[(phase_scaler + 2 * step_q4) & SUBPEL_MASK]); + int x, y = height_hor; + uint8_t *t = temp_buffer; + uint8x8_t s[15], d[8]; + + assert(w && h); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2; + d[6] = vdup_n_u8(0); + d[7] = vdup_n_u8(0); + + // horizontal 6x8 + do { + load_u8_8x8(src + 1, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], &s[7]); + x = width_hor; + + do { + src += 8; + load_u8_8x8(src, src_stride, &s[7], &s[8], &s[9], &s[10], &s[11], &s[12], + &s[13], &s[14]); + transpose_u8_8x8(&s[7], &s[8], &s[9], &s[10], &s[11], &s[12], &s[13], + &s[14]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + + // 00 01 02 03 04 05 xx xx + // 10 11 12 13 14 15 xx xx + // 20 21 22 23 24 25 xx xx + // 30 31 32 33 34 35 xx xx + // 40 41 42 43 44 45 xx xx + // 50 51 52 53 54 55 xx xx + // 60 61 62 63 64 65 xx xx + // 70 71 72 73 74 75 xx xx + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + // store 2 extra pixels + vst1_u8(t + 0 * stride_hor, d[0]); + vst1_u8(t + 1 * stride_hor, d[1]); + vst1_u8(t + 2 * stride_hor, d[2]); + vst1_u8(t + 3 * stride_hor, d[3]); + vst1_u8(t + 4 * stride_hor, d[4]); + vst1_u8(t + 5 * stride_hor, d[5]); + vst1_u8(t + 6 * stride_hor, d[6]); + vst1_u8(t + 7 * stride_hor, d[7]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + t += 6; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 7 * stride_hor + 2; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + load_u8_8x8(t, stride_hor, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + t += 7 * stride_hor; + y = height_ver; + + do { + load_u8_8x8(t, stride_hor, &s[7], &s[8], &s[9], &s[10], &s[11], 
&s[12], + &s[13], &s[14]); + t += 8 * stride_hor; + + d[0] = scale_filter_8(&s[0], filters0); + d[1] = scale_filter_8(&s[(phase_scaler + 1 * step_q4) >> 4], filters1); + d[2] = scale_filter_8(&s[(phase_scaler + 2 * step_q4) >> 4], filters2); + d[3] = scale_filter_8(&s[4], filters0); + d[4] = + scale_filter_8(&s[4 + ((phase_scaler + 1 * step_q4) >> 4)], filters1); + d[5] = + scale_filter_8(&s[4 + ((phase_scaler + 2 * step_q4) >> 4)], filters2); + vst1_u8(dst + 0 * dst_stride, d[0]); + vst1_u8(dst + 1 * dst_stride, d[1]); + vst1_u8(dst + 2 * dst_stride, d[2]); + vst1_u8(dst + 3 * dst_stride, d[3]); + vst1_u8(dst + 4 * dst_stride, d[4]); + vst1_u8(dst + 5 * dst_stride, d[5]); + + s[0] = s[8]; + s[1] = s[9]; + s[2] = s[10]; + s[3] = s[11]; + s[4] = s[12]; + s[5] = s[13]; + s[6] = s[14]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * (4 * height_ver / 3 + 7); + t += 8; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, + INTERP_FILTER filter_type, + int phase_scaler) { + const int src_w = src->y_crop_width; + const int src_h = src->y_crop_height; + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + const int dst_uv_w = dst_w / 2; + const int dst_uv_h = dst_h / 2; + int scaled = 0; + + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (2 * dst_w == src_w && 2 * dst_h == src_h) { + // 2 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, 
dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0, c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0, c1); + } else { + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scaled = 1; + if (filter_type == BILINEAR) { + scale_plane_4_to_3_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, phase_scaler, + temp_buffer); + scale_plane_4_to_3_bilinear(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + scale_plane_4_to_3_bilinear(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, dst_uv_w, + dst_uv_h, phase_scaler, temp_buffer); + } else { + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], + phase_scaler, temp_buffer); + } + free(temp_buffer); + } + } + + if (scaled) { + vpx_extend_frame_borders(dst); + } else { + // Call c version for all other scaling ratios. 
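/*
 * [Editor's aside] The ratio dispatch in vp9_scale_and_extend_frame_neon
 * reduces to exact integer tests; a standalone restatement (a sketch,
 * scale_path is a hypothetical name):
 */
static int scale_path(int src_w, int src_h, int dst_w, int dst_h) {
  if (2 * dst_w == src_w && 2 * dst_h == src_h) return 21; /* 2:1 fast path */
  if (4 * dst_w == src_w && 4 * dst_h == src_h) return 41; /* 4:1 fast path */
  if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) return 43; /* 4:3 */
  return 0; /* any other ratio: fall back to vp9_scale_and_extend_frame_c */
}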
+ vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + } +} diff --git a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c index 0b175969b..97a09bdff 100644 --- a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -9,9 +9,10 @@ */ #include <arm_neon.h> - +#include <assert.h> #include <math.h> +#include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" @@ -31,86 +32,206 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + int i; + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + + (void)scan; + (void)skip_block; + assert(!skip_block); + + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + // process dc and the first seven ac coeffs + { + const int16x8_t v_iscan = vld1q_s16(&iscan[0]); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + // now process the rest of the ac coeffs + for (i = 8; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); + const int32x4_t v_tmp_lo = + vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = + vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = + vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = 
vmulq_s16(v_qcoeff, v_dequant); + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); + } + { + const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + + // ROUND_POWER_OF_TWO(round_ptr[], 1) + const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x4_t dequant = vld1_s16(dequant_ptr); + // dequant >> 2 is used similar to zbin as a threshold. + const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); + + // Process dc and the first seven ac coeffs. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + const int16x8_t dequant_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); + + int16x8_t qcoeff = vaddq_s16(coeff_abs, round); + int32x4_t dqcoeff_0, dqcoeff_1; + int16x8_t dqcoeff; + uint16x8_t eob_max; (void)scan; + (void)count; + (void)skip_block; + assert(!skip_block); + + // coeff * quant_ptr[]) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant); + + // Restore sign. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + qcoeff = vandq_s16(qcoeff, dequant_mask); + + // qcoeff * dequant[] / 2 + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant); + dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. + // Add 1 if negative to round towards zero because the C uses division. 
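/*
 * [Editor's aside] Why the sign bit is added before the narrowing shift
 * here: C integer division truncates toward zero, while an arithmetic
 * right shift rounds toward negative infinity. A scalar check (assumes
 * the usual arithmetic >> on negative values, as on libvpx targets):
 */
#include <assert.h>
#include <stdint.h>

int main(void) {
  const int32_t v = -7;
  const int32_t sign = (int32_t)((uint32_t)v >> 31); /* 1 if negative */
  assert(v / 2 == -3);                /* division truncates toward zero */
  assert((v >> 1) == -4);             /* plain shift rounds down instead */
  assert(((v + sign) >> 1) == v / 2); /* corrected shift matches division */
  return 0;
}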
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + + iscan_ptr += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + + { int i; - const int16x8_t v_zero = vdupq_n_s16(0); - const int16x8_t v_one = vdupq_n_s16(1); - int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); - int16x8_t v_round = vmovq_n_s16(round_ptr[1]); - int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); - int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); - // adjust for dc - v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); - v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); - v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); - // process dc and the first seven ac coeffs - { - const int16x8_t v_iscan = vld1q_s16(&iscan[0]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); - store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); - v_round = vmovq_n_s16(round_ptr[1]); - v_quant = vmovq_n_s16(quant_ptr[1]); - v_dequant = vmovq_n_s16(dequant_ptr[1]); - } - // now process the rest of the ac coeffs - for (i = 8; i < count; i += 8) { - const int16x8_t v_iscan = vld1q_s16(&iscan[i]); - const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); - const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); - const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); - const int32x4_t v_tmp_lo = - vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); - const int32x4_t v_tmp_hi = - vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); - const int16x8_t v_tmp2 = - vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); - const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); - const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); - const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); - const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); - const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); - const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); - v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); - store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); - store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); + const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1); + const int16x8_t quant = vmovq_n_s16(quant_ptr[1]); + const int16x8_t dequant_thresh = + vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2); + + // Process the rest of the ac coeffs. 
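/*
 * [Editor's aside] One coefficient through the 32x32 fp quantizer, in
 * scalar form. A sketch: vqdmulhq_s16(a, b) computes the saturating
 * (2 * a * b) >> 16, i.e. roughly (a * b) >> 15; saturation is omitted
 * here and the function name is hypothetical:
 */
#include <stdint.h>

static void quantize_fp_32x32_scalar(int16_t coeff, int16_t round,
                                     int16_t quant, int16_t dequant,
                                     int16_t *qcoeff, int16_t *dqcoeff) {
  const int abs_coeff = coeff < 0 ? -coeff : coeff;
  int q = 0;
  if (abs_coeff >= (dequant >> 2)) /* dequant / 4 acts as the zbin */
    q = ((abs_coeff + ((round + 1) >> 1)) * quant) >> 15;
  if (coeff < 0) q = -q;
  *qcoeff = (int16_t)q;
  *dqcoeff = (int16_t)(q * dequant / 2); /* division truncates toward zero */
}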
+ for (i = 8; i < 32 * 32; i += 8) { + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + const int16x8_t dequant_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh)); + + int16x8_t qcoeff = vaddq_s16(coeff_abs, round); + int32x4_t dqcoeff_0, dqcoeff_1; + int16x8_t dqcoeff; + + qcoeff = vqdmulhq_s16(qcoeff, quant); + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + qcoeff = vandq_s16(qcoeff, dequant_mask); + + dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]); + dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]); + + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + + iscan_ptr += 8; + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; } + { - const int16x4_t v_eobmax_3210 = vmax_s16( - vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); - const int64x1_t v_eobmax_xx32 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); - const int16x4_t v_eobmax_tmp = - vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); - const int64x1_t v_eobmax_xxx3 = - vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); - const int16x4_t v_eobmax_final = - vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); - - *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); } - } else { - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - *eob_ptr = 0; } } diff --git a/libvpx/vp9/encoder/vp9_alt_ref_aq.c b/libvpx/vp9/encoder/vp9_alt_ref_aq.c index 3aeefb584..acc3764c7 100644 --- a/libvpx/vp9/encoder/vp9_alt_ref_aq.c +++ b/libvpx/vp9/encoder/vp9_alt_ref_aq.c @@ -15,7 +15,7 @@ struct ALT_REF_AQ { int dummy; }; -struct ALT_REF_AQ *vp9_alt_ref_aq_create() { +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void) { return (struct ALT_REF_AQ *)vpx_malloc(sizeof(struct ALT_REF_AQ)); } diff --git a/libvpx/vp9/encoder/vp9_alt_ref_aq.h b/libvpx/vp9/encoder/vp9_alt_ref_aq.h index 18acd8a85..e508cb44a 100644 --- a/libvpx/vp9/encoder/vp9_alt_ref_aq.h +++ b/libvpx/vp9/encoder/vp9_alt_ref_aq.h @@ -54,7 +54,7 @@ struct ALT_REF_AQ; * * \return Instance of the class */ -struct ALT_REF_AQ *vp9_alt_ref_aq_create(); +struct ALT_REF_AQ *vp9_alt_ref_aq_create(void); /*!\brief Upload segmentation_map to self object * diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 048ea629f..2f2f0055a 100644 --- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -425,9 +425,10 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { int target_refresh = 0; double weight_segment_target = 0; double weight_segment = 0; + int thresh_low_motion = (cm->width < 720) ? 
55 : 20; cr->apply_cyclic_refresh = 1; if (cm->frame_type == KEY_FRAME || cpi->svc.temporal_layer_id > 0 || - (!cpi->use_svc && rc->avg_frame_low_motion < 55 && + (!cpi->use_svc && rc->avg_frame_low_motion < thresh_low_motion && rc->frames_since_key > 40)) { cr->apply_cyclic_refresh = 0; return; diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c index 8433f4edd..d346cd57a 100644 --- a/libvpx/vp9/encoder/vp9_bitstream.c +++ b/libvpx/vp9/encoder/vp9_bitstream.c @@ -919,7 +919,9 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { } } -static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) { +static int encode_tile_worker(void *arg1, void *arg2) { + VP9_COMP *cpi = (VP9_COMP *)arg1; + VP9BitstreamWorkerData *data = (VP9BitstreamWorkerData *)arg2; MACROBLOCKD *const xd = &data->xd; const int tile_row = 0; vpx_start_encode(&data->bit_writer, data->dest); @@ -995,7 +997,7 @@ static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { } worker->data1 = cpi; worker->data2 = data; - worker->hook = (VPxWorkerHook)encode_tile_worker; + worker->hook = encode_tile_worker; worker->had_error = 0; if (i < num_workers - 1) { diff --git a/libvpx/vp9/encoder/vp9_block.h b/libvpx/vp9/encoder/vp9_block.h index ab488f48f..724205dd5 100644 --- a/libvpx/vp9/encoder/vp9_block.h +++ b/libvpx/vp9/encoder/vp9_block.h @@ -172,6 +172,14 @@ struct macroblock { uint8_t last_sb_high_content; + int sb_use_mv_part; + + int sb_mvcol_part; + + int sb_mvrow_part; + + int sb_pickmode_part; + // For each superblock: saves the content value (e.g., low/high sad/sumdiff) // based on source sad, prior to encoding the frame. uint8_t content_state_sb; @@ -181,11 +189,15 @@ struct macroblock { // 32x32, 9~24 for 16x16. uint8_t variance_low[25]; - void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride); - void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob); + uint8_t arf_frame_usage; + uint8_t lastgolden_frame_usage; + + void (*fwd_txfm4x4)(const int16_t *input, tran_low_t *output, int stride); + void (*inv_txfm_add)(const tran_low_t *input, uint8_t *dest, int stride, + int eob); #if CONFIG_VP9_HIGHBITDEPTH - void (*highbd_itxm_add)(const tran_low_t *input, uint16_t *dest, int stride, - int eob, int bd); + void (*highbd_inv_txfm_add)(const tran_low_t *input, uint16_t *dest, + int stride, int eob, int bd); #endif }; diff --git a/libvpx/vp9/encoder/vp9_context_tree.h b/libvpx/vp9/encoder/vp9_context_tree.h index 9e4cbb360..73423c075 100644 --- a/libvpx/vp9/encoder/vp9_context_tree.h +++ b/libvpx/vp9/encoder/vp9_context_tree.h @@ -65,6 +65,7 @@ typedef struct { int_mv best_sse_mv; MV_REFERENCE_FRAME best_reference_frame; MV_REFERENCE_FRAME best_zeromv_reference_frame; + int sb_skip_denoising; #endif // motion vector cache for adaptive motion search control in partition diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c index e6933f00d..b08ccaa66 100644 --- a/libvpx/vp9/encoder/vp9_denoiser.c +++ b/libvpx/vp9/encoder/vp9_denoiser.c @@ -21,8 +21,6 @@ #include "vp9/encoder/vp9_denoiser.h" #include "vp9/encoder/vp9_encoder.h" -// OUTPUT_YUV_DENOISED - #ifdef OUTPUT_YUV_DENOISED static void make_grayscale(YV12_BUFFER_CONFIG *yuv); #endif @@ -190,11 +188,13 @@ static VP9_DENOISER_DECISION perform_motion_compensation( VP9_COMMON *const cm, VP9_DENOISER *denoiser, MACROBLOCK *mb, BLOCK_SIZE bs, int increase_denoising, int mi_row, int mi_col, PICK_MODE_CONTEXT *ctx, int motion_magnitude, int is_skin, int *zeromv_filter, int 
consec_zeromv, - int num_spatial_layers, int width) { + int num_spatial_layers, int width, int lst_fb_idx, int gld_fb_idx, + int use_svc, int spatial_layer) { const int sse_diff = (ctx->newmv_sse == UINT_MAX) ? 0 : ((int)ctx->zeromv_sse - (int)ctx->newmv_sse); - MV_REFERENCE_FRAME frame; + int frame; + int denoise_layer_idx = 0; MACROBLOCKD *filter_mbd = &mb->e_mbd; MODE_INFO *mi = filter_mbd->mi[0]; MODE_INFO saved_mi; @@ -202,8 +202,10 @@ static VP9_DENOISER_DECISION perform_motion_compensation( struct buf_2d saved_dst[MAX_MB_PLANE]; struct buf_2d saved_pre[MAX_MB_PLANE]; RefBuffer *saved_block_refs[2]; + MV_REFERENCE_FRAME saved_frame; frame = ctx->best_reference_frame; + saved_mi = *mi; if (is_skin && (motion_magnitude > 0 || consec_zeromv < 4)) return COPY_BLOCK; @@ -217,7 +219,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( // If the best reference frame uses inter-prediction and there is enough of a // difference in sum-squared-error, use it. - if (frame != INTRA_FRAME && + if (frame != INTRA_FRAME && frame != ALTREF_FRAME && (frame != GOLDEN_FRAME || num_spatial_layers == 1) && sse_diff > sse_diff_thresh(bs, increase_denoising, motion_magnitude)) { mi->ref_frame[0] = ctx->best_reference_frame; @@ -228,7 +230,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( frame = ctx->best_zeromv_reference_frame; ctx->newmv_sse = ctx->zeromv_sse; // Bias to last reference. - if (num_spatial_layers > 1 || + if (num_spatial_layers > 1 || frame == ALTREF_FRAME || (frame != LAST_FRAME && ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) || denoiser->denoising_level >= kDenHigh))) { @@ -246,6 +248,19 @@ static VP9_DENOISER_DECISION perform_motion_compensation( } } + saved_frame = frame; + // When using SVC, we need to map REF_FRAME to the frame buffer index. + if (use_svc) { + if (frame == LAST_FRAME) + frame = lst_fb_idx + 1; + else if (frame == GOLDEN_FRAME) + frame = gld_fb_idx + 1; + // Shift for the second spatial layer. 
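/*
 * [Editor's aside] The REF_FRAME -> denoiser buffer mapping applied here,
 * restated as a standalone helper (a sketch; assumes the vp9 enum values
 * LAST_FRAME == 1 and GOLDEN_FRAME == 2, and a hypothetical name):
 */
static int denoiser_buf_index(int frame, int lst_fb_idx, int gld_fb_idx,
                              int is_second_spatial_layer,
                              int num_ref_frames) {
  int idx = frame; /* INTRA_FRAME (0) keeps slot 0 */
  if (frame == 1)
    idx = lst_fb_idx + 1; /* LAST_FRAME */
  else if (frame == 2)
    idx = gld_fb_idx + 1; /* GOLDEN_FRAME */
  if (is_second_spatial_layer) idx += num_ref_frames;
  return idx;
}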
+ if (num_spatial_layers - spatial_layer == 2) + frame = frame + denoiser->num_ref_frames; + denoise_layer_idx = num_spatial_layers - spatial_layer - 1; + } + if (ctx->newmv_sse > sse_thresh(bs, increase_denoising)) { // Restore everything to its original state *mi = saved_mi; @@ -279,20 +294,23 @@ static VP9_DENOISER_DECISION perform_motion_compensation( denoiser->running_avg_y[frame].uv_stride, mi_row, mi_col); filter_mbd->plane[2].pre[0].stride = denoiser->running_avg_y[frame].uv_stride; - filter_mbd->plane[0].dst.buf = - block_start(denoiser->mc_running_avg_y.y_buffer, - denoiser->mc_running_avg_y.y_stride, mi_row, mi_col); - filter_mbd->plane[0].dst.stride = denoiser->mc_running_avg_y.y_stride; - filter_mbd->plane[1].dst.buf = - block_start(denoiser->mc_running_avg_y.u_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[1].dst.stride = denoiser->mc_running_avg_y.uv_stride; - filter_mbd->plane[2].dst.buf = - block_start(denoiser->mc_running_avg_y.v_buffer, - denoiser->mc_running_avg_y.uv_stride, mi_row, mi_col); - filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y.uv_stride; - - set_ref_ptrs(cm, filter_mbd, frame, NONE); + filter_mbd->plane[0].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].y_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride, mi_row, mi_col); + filter_mbd->plane[0].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].y_stride; + filter_mbd->plane[1].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].u_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[1].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + filter_mbd->plane[2].dst.buf = block_start( + denoiser->mc_running_avg_y[denoise_layer_idx].v_buffer, + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride, mi_row, mi_col); + filter_mbd->plane[2].dst.stride = + denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; + + set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state @@ -314,9 +332,17 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, int zeromv_filter = 0; VP9_DENOISER *denoiser = &cpi->denoiser; VP9_DENOISER_DECISION decision = COPY_BLOCK; - YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME]; - YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y; + + const int shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? denoiser->num_ref_frames + : 0; + YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME + shift]; + const int denoise_layer_index = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id - 1; + YV12_BUFFER_CONFIG mc_avg = denoiser->mc_running_avg_y[denoise_layer_index]; uint8_t *avg_start = block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col); + uint8_t *mc_avg_start = block_start(mc_avg.y_buffer, mc_avg.y_stride, mi_row, mi_col); struct buf_2d src = mb->plane[0].src; @@ -338,8 +364,8 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, VP9_COMMON *const cm = &cpi->common; int j, i; // Loop through the 8x8 sub-blocks. 
- const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64]; - const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64]; + const int bw = num_8x8_blocks_wide_lookup[bs]; + const int bh = num_8x8_blocks_high_lookup[bs]; const int xmis = VPXMIN(cm->mi_cols - mi_col, bw); const int ymis = VPXMIN(cm->mi_rows - mi_row, bh); const int block_index = mi_row * cm->mi_cols + mi_col; @@ -366,14 +392,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, } if (!is_skin && denoiser->denoising_level == kDenHigh) increase_denoising = 1; - // TODO(marpan): There is an issue with denoising for speed 5, - // due to the partitioning scheme based on pickmode. - // Remove this speed constraint when issue is resolved. - if (denoiser->denoising_level >= kDenLow && cpi->oxcf.speed > 5) + if (denoiser->denoising_level >= kDenLow && !ctx->sb_skip_denoising) decision = perform_motion_compensation( &cpi->common, denoiser, mb, bs, increase_denoising, mi_row, mi_col, ctx, motion_magnitude, is_skin, &zeromv_filter, consec_zeromv, - cpi->svc.number_spatial_layers, cpi->Source->y_width); + cpi->svc.number_spatial_layers, cpi->Source->y_width, cpi->lst_fb_idx, + cpi->gld_fb_idx, cpi->use_svc, cpi->svc.spatial_layer_id); if (decision == FILTER_BLOCK) { decision = vp9_denoiser_filter(src.buf, src.stride, mc_avg_start, @@ -382,12 +406,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, } if (decision == FILTER_BLOCK) { - vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } else { // COPY_BLOCK - vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } *denoiser_decision = decision; @@ -423,7 +447,9 @@ static void swap_frame_buffer(YV12_BUFFER_CONFIG *const dest, void vp9_denoiser_update_frame_info( VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, - int resized, int svc_base_is_key) { + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, + int svc_base_is_key, int second_spatial_layer) { + const int shift = second_spatial_layer ? denoiser->num_ref_frames : 0; // Copy source into denoised reference buffers on KEY_FRAME or // if the just encoded frame was resized. For SVC, copy source if the base // spatial layer was key frame. @@ -431,8 +457,10 @@ void vp9_denoiser_update_frame_info( svc_base_is_key) { int i; // Start at 1 so as not to overwrite the INTRA_FRAME - for (i = 1; i < MAX_REF_FRAMES; ++i) - copy_frame(&denoiser->running_avg_y[i], &src); + for (i = 1; i < denoiser->num_ref_frames; ++i) { + if (denoiser->running_avg_y[i + shift].buffer_alloc != NULL) + copy_frame(&denoiser->running_avg_y[i + shift], &src); + } denoiser->reset = 0; return; } @@ -440,29 +468,29 @@ void vp9_denoiser_update_frame_info( // If more than one refresh occurs, must copy frame buffer. 
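/*
 * [Editor's aside] Why the branch below copies instead of swapping: slot
 * INTRA_FRAME holds the freshly denoised frame; a single refresh target
 * can simply trade buffers with it, but several targets each need slot 0
 * intact. A sketch; Frame stands in for YV12_BUFFER_CONFIG:
 */
#include <string.h>

typedef struct { unsigned char *buf; size_t size; } Frame;

static void update_refs(Frame *slots, const int *refresh, int n) {
  int i, targets = 0;
  for (i = 1; i < n; ++i) targets += refresh[i];
  for (i = 1; i < n; ++i) {
    if (!refresh[i]) continue;
    if (targets > 1) {
      memcpy(slots[i].buf, slots[0].buf, slots[0].size); /* slot 0 reused */
    } else {
      Frame t = slots[i]; /* single target: O(1) buffer swap */
      slots[i] = slots[0];
      slots[0] = t;
    }
  }
}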
if ((refresh_alt_ref_frame + refresh_golden_frame + refresh_last_frame) > 1) { if (refresh_alt_ref_frame) { - copy_frame(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { - copy_frame(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { - copy_frame(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + copy_frame(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } else { if (refresh_alt_ref_frame) { - swap_frame_buffer(&denoiser->running_avg_y[ALTREF_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[alt_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_golden_frame) { - swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[gld_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } if (refresh_last_frame) { - swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME], - &denoiser->running_avg_y[INTRA_FRAME]); + swap_frame_buffer(&denoiser->running_avg_y[lst_fb_idx + 1 + shift], + &denoiser->running_avg_y[INTRA_FRAME + shift]); } } } @@ -491,19 +519,110 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, } } -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +static int vp9_denoiser_realloc_svc_helper(VP9_COMMON *cm, + VP9_DENOISER *denoiser, int fb_idx) { + int fail = 0; + if (denoiser->running_avg_y[fb_idx].buffer_alloc == NULL) { + fail = + vpx_alloc_frame_buffer(&denoiser->running_avg_y[fb_idx], cm->width, + cm->height, cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, 0); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } + } + return 0; +} + +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx) { + int fail = 0; + if (refresh_alt) { + // Increase the frame buffer index by 1 to map it to the buffer index in the + // denoiser. 
+ fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + alt_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_gld) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + gld_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + if (refresh_lst) { + fail = vp9_denoiser_realloc_svc_helper(cm, denoiser, + lst_fb_idx + 1 + svc_buf_shift); + if (fail) return 1; + } + return 0; +} + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif int border) { - int i, fail; + int i, layer, fail, init_num_ref_frames; const int legacy_byte_alignment = 0; + int num_layers = 1; + int scaled_width = width; + int scaled_height = height; + if (use_svc) { + LAYER_CONTEXT *lc = &svc->layer_context[svc->spatial_layer_id * + svc->number_temporal_layers + + svc->temporal_layer_id]; + get_layer_resolution(width, height, lc->scaling_factor_num, + lc->scaling_factor_den, &scaled_width, &scaled_height); + // For SVC: only denoise at most 2 spatial (highest) layers. + if (noise_sen >= 2) + // Denoise from one spatial layer below the top. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 2, 0); + else + // Only denoise the top spatial layer. + svc->first_layer_denoise = VPXMAX(svc->number_spatial_layers - 1, 0); + num_layers = svc->number_spatial_layers - svc->first_layer_denoise; + } assert(denoiser != NULL); + denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; + init_num_ref_frames = use_svc ? MAX_REF_FRAMES : NONSVC_REF_FRAMES; + denoiser->num_layers = num_layers; + CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + vpx_calloc(denoiser->num_ref_frames * num_layers, + sizeof(denoiser->running_avg_y[0]))); + CHECK_MEM_ERROR( + cm, denoiser->mc_running_avg_y, + vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); + + for (layer = 0; layer < num_layers; ++layer) { + const int denoise_width = (layer == 0) ? width : scaled_width; + const int denoise_height = (layer == 0) ? height : scaled_height; + for (i = 0; i < init_num_ref_frames; ++i) { + fail = vpx_alloc_frame_buffer( + &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], + denoise_width, denoise_height, ssx, ssy, +#if CONFIG_VP9_HIGHBITDEPTH + use_highbitdepth, +#endif + border, legacy_byte_alignment); + if (fail) { + vp9_denoiser_free(denoiser); + return 1; + } +#ifdef OUTPUT_YUV_DENOISED + make_grayscale(&denoiser->running_avg_y[i]); +#endif + } - for (i = 0; i < MAX_REF_FRAMES; ++i) { - fail = vpx_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height, - ssx, ssy, + fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y[layer], + denoise_width, denoise_height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, #endif @@ -512,22 +631,10 @@ int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, vp9_denoiser_free(denoiser); return 1; } -#ifdef OUTPUT_YUV_DENOISED - make_grayscale(&denoiser->running_avg_y[i]); -#endif - } - - fail = vpx_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height, ssx, - ssy, -#if CONFIG_VP9_HIGHBITDEPTH - use_highbitdepth, -#endif - border, legacy_byte_alignment); - if (fail) { - vp9_denoiser_free(denoiser); - return 1; } + // denoiser->last_source only used for noise_estimation, so only for top + // layer. 
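/*
 * [Editor's aside] The SVC layer selection in vp9_denoiser_alloc above,
 * restated: noise sensitivity >= 2 denoises the top two spatial layers,
 * otherwise only the top one (hypothetical helper name):
 */
static void denoise_layer_span(int num_spatial_layers, int noise_sen,
                               int *first_layer, int *num_layers) {
  const int span = (noise_sen >= 2) ? 2 : 1;
  *first_layer = num_spatial_layers - span;
  if (*first_layer < 0) *first_layer = 0; /* VPXMAX(..., 0) in the source */
  *num_layers = num_spatial_layers - *first_layer;
}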
fail = vpx_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, #if CONFIG_VP9_HIGHBITDEPTH use_highbitdepth, @@ -553,10 +660,18 @@ void vp9_denoiser_free(VP9_DENOISER *denoiser) { return; } denoiser->frame_buffer_initialized = 0; - for (i = 0; i < MAX_REF_FRAMES; ++i) { + for (i = 0; i < denoiser->num_ref_frames * denoiser->num_layers; ++i) { vpx_free_frame_buffer(&denoiser->running_avg_y[i]); } - vpx_free_frame_buffer(&denoiser->mc_running_avg_y); + vpx_free(denoiser->running_avg_y); + denoiser->running_avg_y = NULL; + + for (i = 0; i < denoiser->num_layers; ++i) { + vpx_free_frame_buffer(&denoiser->mc_running_avg_y[i]); + } + + vpx_free(denoiser->mc_running_avg_y); + denoiser->mc_running_avg_y = NULL; vpx_free_frame_buffer(&denoiser->last_source); } @@ -570,7 +685,8 @@ void vp9_denoiser_set_noise_level(VP9_DENOISER *denoiser, int noise_level) { denoiser->prev_denoising_level = denoiser->denoising_level; } -// Scale/increase the partition threshold for denoiser speed-up. +// Scale/increase the partition threshold +// for denoiser speed-up. int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int content_state, int temporal_layer_id) { if ((content_state == kLowSadLowSumdiff) || @@ -585,7 +701,8 @@ int64_t vp9_scale_part_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, } } -// Scale/increase the ac skip threshold for denoiser speed-up. +// Scale/increase the ac skip threshold for +// denoiser speed-up. int64_t vp9_scale_acskip_thresh(int64_t threshold, VP9_DENOISER_LEVEL noise_level, int abs_sumdiff, int temporal_layer_id) { diff --git a/libvpx/vp9/encoder/vp9_denoiser.h b/libvpx/vp9/encoder/vp9_denoiser.h index f0845e113..f4da24cbf 100644 --- a/libvpx/vp9/encoder/vp9_denoiser.h +++ b/libvpx/vp9/encoder/vp9_denoiser.h @@ -21,6 +21,14 @@ extern "C" { #define MOTION_MAGNITUDE_THRESHOLD (8 * 3) +// Denoiser is used in non svc real-time mode which does not use alt-ref, so no +// need to allocate for it, and hence we need MAX_REF_FRAME - 1 +#define NONSVC_REF_FRAMES MAX_REF_FRAMES - 1 + +// Number of frame buffers when SVC is used. 
[0] for current denoised buffer and +// [1..8] for REF_FRAMES +#define SVC_REF_FRAMES 9 + typedef enum vp9_denoiser_decision { COPY_BLOCK, FILTER_BLOCK, @@ -35,11 +43,13 @@ typedef enum vp9_denoiser_level { } VP9_DENOISER_LEVEL; typedef struct vp9_denoiser { - YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES]; - YV12_BUFFER_CONFIG mc_running_avg_y; + YV12_BUFFER_CONFIG *running_avg_y; + YV12_BUFFER_CONFIG *mc_running_avg_y; YV12_BUFFER_CONFIG last_source; int frame_buffer_initialized; int reset; + int num_ref_frames; + int num_layers; VP9_DENOISER_LEVEL denoising_level; VP9_DENOISER_LEVEL prev_denoising_level; } VP9_DENOISER; @@ -57,11 +67,13 @@ typedef struct { } VP9_PICKMODE_CTX_DEN; struct VP9_COMP; +struct SVC; void vp9_denoiser_update_frame_info( VP9_DENOISER *denoiser, YV12_BUFFER_CONFIG src, FRAME_TYPE frame_type, int refresh_alt_ref_frame, int refresh_golden_frame, int refresh_last_frame, - int resized, int svc_base_is_key); + int alt_fb_idx, int gld_fb_idx, int lst_fb_idx, int resized, + int svc_base_is_key, int second_spatial_layer); void vp9_denoiser_denoise(struct VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, BLOCK_SIZE bs, PICK_MODE_CONTEXT *ctx, @@ -73,8 +85,14 @@ void vp9_denoiser_update_frame_stats(MODE_INFO *mi, unsigned int sse, PREDICTION_MODE mode, PICK_MODE_CONTEXT *ctx); -int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height, int ssx, - int ssy, +int vp9_denoiser_realloc_svc(VP9_COMMON *cm, VP9_DENOISER *denoiser, + int svc_buf_shift, int refresh_alt, + int refresh_gld, int refresh_lst, int alt_fb_idx, + int gld_fb_idx, int lst_fb_idx); + +int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, + int use_svc, int noise_sen, int width, int height, + int ssx, int ssy, #if CONFIG_VP9_HIGHBITDEPTH int use_highbitdepth, #endif diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c index 6215e198c..682477df1 100644 --- a/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/libvpx/vp9/encoder/vp9_encodeframe.c @@ -125,19 +125,17 @@ static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = { }; #endif // CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs) { +unsigned int vp9_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs) { unsigned int sse; const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride, VP9_VAR_OFFS, 0, &sse); - return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]); + return var; } #if CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, - const struct buf_2d *ref, - BLOCK_SIZE bs, int bd) { +unsigned int vp9_high_get_sby_variance(VP9_COMP *cpi, const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { unsigned int var, sse; switch (bd) { case 10: @@ -157,8 +155,24 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse); break; } - return (unsigned int)ROUND64_POWER_OF_TWO((int64_t)var, - num_pels_log2_lookup[bs]); + return var; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +unsigned int vp9_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs) { + return ROUND_POWER_OF_TWO(vp9_get_sby_variance(cpi, ref, bs), + num_pels_log2_lookup[bs]); +} + +#if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, + const struct buf_2d *ref, + BLOCK_SIZE bs, int bd) { + return (unsigned int)ROUND64_POWER_OF_TWO( + (int64_t)vp9_high_get_sby_variance(cpi, 
ref, bs, bd), + num_pels_log2_lookup[bs]); } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -287,8 +301,12 @@ static void set_block_size(VP9_COMP *const cpi, MACROBLOCK *const x, } typedef struct { - int64_t sum_square_error; - int64_t sum_error; + // This struct is used for computing variance in choose_partitioning(), where + // the max number of samples within a superblock is 16x16 (with 4x4 avg). Even + // in high bitdepth, uint32_t is enough for sum_square_error (2^12 * 2^12 * 16 + // * 16 = 2^32). + uint32_t sum_square_error; + int32_t sum_error; int log2_count; int variance; } var; @@ -381,7 +399,7 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } // Set variance values given sum square error, sum error, count. -static void fill_variance(int64_t s2, int64_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; @@ -489,8 +507,9 @@ static int set_vt_partitioning(VP9_COMP *cpi, MACROBLOCK *const x, return 0; } -int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, int width, - int height, int content_state) { +static int64_t scale_part_thresh_sumdiff(int64_t threshold_base, int speed, + int width, int height, + int content_state) { if (speed >= 8) { if (width <= 640 && height <= 480) return (5 * threshold_base) >> 2; @@ -554,6 +573,8 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q, #endif thresholds[0] = threshold_base; thresholds[2] = threshold_base << cpi->oxcf.speed; + if (cm->width >= 1280 && cm->height >= 720 && cpi->oxcf.speed < 7) + thresholds[2] = thresholds[2] << 1; if (cm->width <= 352 && cm->height <= 288) { thresholds[0] = threshold_base >> 3; thresholds[1] = threshold_base >> 1; @@ -742,16 +763,7 @@ static int skin_sb_split(VP9_COMP *cpi, MACROBLOCK *x, const int low_res, for (i = 0; i < ymis; i += 2) { for (j = 0; j < xmis; j += 2) { int bl_index = block_index + i * cm->mi_cols + j; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - int consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - int is_skin = vp9_compute_skin_block( - ysignal, usignal, vsignal, sp, spuv, BLOCK_16X16, consec_zeromv, 0); + int is_skin = cpi->skin_map[bl_index]; num_16x16_skin += is_skin; num_16x16_nonskin += (1 - is_skin); if (num_16x16_nonskin > 3) { @@ -849,7 +861,7 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, int start_pos = mi_row * cm->mi_stride + mi_col; const int bsl = b_width_log2_lookup[bsize]; - const int bs = (1 << bsl) / 4; + const int bs = (1 << bsl) >> 2; BLOCK_SIZE subsize; PARTITION_TYPE partition; @@ -895,10 +907,7 @@ static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, int layer = LAYER_IDS_TO_IDX(0, cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers); const LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; - if (lc->is_key_frame || - (cpi->svc.temporal_layer_id != cpi->svc.number_temporal_layers - 1 && - cpi->svc.number_temporal_layers > 1)) - svc_copy_allowed = 0; + if (lc->is_key_frame || !cpi->svc.non_reference_frame) svc_copy_allowed = 0; frames_since_key_thresh = cpi->svc.number_spatial_layers << 1; } if (cpi->rc.frames_since_key > frames_since_key_thresh && svc_copy_allowed && @@ -917,13 +926,165 @@ static int copy_partitioning(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, 
return 0; } -static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, - int mi_col) { +static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int mi_row_high, int mi_col_high) { + VP9_COMMON *const cm = &cpi->common; + SVC *const svc = &cpi->svc; + BLOCK_SIZE *prev_part = svc->prev_partition_svc; + // Variables with _high are for higher resolution. + int bsize_high = 0; + int subsize_high = 0; + const int bsl_high = b_width_log2_lookup[bsize]; + const int bs_high = (1 << bsl_high) >> 2; + const int has_rows = (mi_row_high + bs_high) < cm->mi_rows; + const int has_cols = (mi_col_high + bs_high) < cm->mi_cols; + + const int row_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0 + }; + const int col_boundary_block_scale_factor[BLOCK_SIZES] = { + 13, 13, 13, 2, 2, 0, 2, 2, 0, 2, 2, 0, 0 + }; + int start_pos; + BLOCK_SIZE bsize_low; + PARTITION_TYPE partition_high; + + if (mi_row_high >= cm->mi_rows || mi_col_high >= cm->mi_cols) return 0; + if (mi_row >= (cm->mi_rows >> 1) || mi_col >= (cm->mi_cols >> 1)) return 0; + + // Find corresponding (mi_col/mi_row) block down-scaled by 2x2. + start_pos = mi_row * (svc->mi_stride[svc->spatial_layer_id - 1]) + mi_col; + bsize_low = prev_part[start_pos]; + // The block size is too big for boundaries. Do variance based partitioning. + if ((!has_rows || !has_cols) && bsize_low > BLOCK_16X16) return 1; + + // For reference frames: return 1 (do variance-based partitioning) if the + // superblock is not low source sad and lower-resoln bsize is below 32x32. + if (!cpi->svc.non_reference_frame && !x->skip_low_source_sad && + bsize_low < BLOCK_32X32) + return 1; + + // Scale up block size by 2x2. Force 64x64 for size larger than 32x32. + if (bsize_low < BLOCK_32X32) { + bsize_high = bsize_low + 3; + } else if (bsize_low >= BLOCK_32X32) { + bsize_high = BLOCK_64X64; + } + // Scale up blocks on boundary. 
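
The bsize_low + 3 step above leans on the BLOCK_SIZES ordering, in which the square sizes sit three entries apart, so adding 3 doubles both block dimensions; the boundary tables that follow then tweak that step for partial superblocks. A small sketch, assuming the usual upstream enum values:

#include <stdio.h>

/* Assumed enum values: square sizes are 3 apart in vp9's BLOCK_SIZES. */
enum { BLOCK_8X8 = 3, BLOCK_16X16 = 6, BLOCK_32X32 = 9, BLOCK_64X64 = 12 };

int main(void) {
  int bsize_low;
  for (bsize_low = BLOCK_8X8; bsize_low <= BLOCK_32X32; bsize_low += 3) {
    int bsize_high = (bsize_low < BLOCK_32X32) ? bsize_low + 3 : BLOCK_64X64;
    printf("%d -> %d\n", bsize_low, bsize_high); /* a 2x2 upscale per step */
  }
  return 0;
}
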
+ if (!has_cols && has_rows) { + bsize_high = bsize_low + row_boundary_block_scale_factor[bsize_low]; + } else if (has_cols && !has_rows) { + bsize_high = bsize_low + col_boundary_block_scale_factor[bsize_low]; + } else if (!has_cols && !has_rows) { + bsize_high = bsize_low; + } + + partition_high = partition_lookup[bsl_high][bsize_high]; + subsize_high = get_subsize(bsize, partition_high); + + if (subsize_high < BLOCK_8X8) { + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + } else { + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + switch (partition_high) { + case PARTITION_NONE: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, bsize_high); + break; + case PARTITION_HORZ: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high + bs_high, mi_col_high, + subsize_high); + break; + case PARTITION_VERT: + set_block_size(cpi, x, xd, mi_row_high, mi_col_high, subsize_high); + if (subsize_high < BLOCK_64X64) + set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, + subsize_high); + break; + case PARTITION_SPLIT: + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, + mi_row_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col, mi_row_high + bs_high, mi_col_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, + mi_col + (bs >> 1), mi_row_high, + mi_col_high + bs_high)) + return 1; + if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row + (bs >> 1), + mi_col + (bs >> 1), mi_row_high + bs_high, + mi_col_high + bs_high)) + return 1; + break; + default: assert(0); + } + } + + return 0; +} + +static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->svc.prev_partition_svc; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) >> 2; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + int xx, yy; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: + prev_part[start_pos] = bsize; + if (bsize == BLOCK_64X64) { + for (xx = 0; xx < 8; xx += 4) + for (yy = 0; yy < 8; yy += 4) { + if ((mi_row + xx < cm->mi_rows) && (mi_col + yy < cm->mi_cols)) + prev_part[start_pos + xx * cm->mi_stride + yy] = bsize; + } + } + break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + case PARTITION_SPLIT: + update_partition_svc(cpi, subsize, mi_row, mi_col); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col); + update_partition_svc(cpi, subsize, mi_row, mi_col + bs); + update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col) { VP9_COMMON *const cm = &cpi->common; BLOCK_SIZE *prev_part = cpi->prev_partition; 
int start_pos = mi_row * cm->mi_stride + mi_col; const int bsl = b_width_log2_lookup[bsize]; - const int bs = (1 << bsl) / 4; + const int bs = (1 << bsl) >> 2; BLOCK_SIZE subsize; PARTITION_TYPE partition; const MODE_INFO *mi = NULL; @@ -948,16 +1109,26 @@ static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; case PARTITION_SPLIT: - update_prev_partition(cpi, subsize, mi_row, mi_col); - update_prev_partition(cpi, subsize, mi_row + bs, mi_col); - update_prev_partition(cpi, subsize, mi_row, mi_col + bs); - update_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); + update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); + update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); break; default: assert(0); } } } +static void update_prev_partition(VP9_COMP *cpi, MACROBLOCK *x, int segment_id, + int mi_row, int mi_col, int sb_offset) { + update_prev_partition_helper(cpi, BLOCK_64X64, mi_row, mi_col); + cpi->prev_segment_id[sb_offset] = segment_id; + memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low, + sizeof(x->variance_low)); + // Reset the counter for copy partitioning + cpi->copied_frame_cnt[sb_offset] = 0; +} + static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, unsigned int y_sad, int is_key_frame) { int i; @@ -989,8 +1160,8 @@ static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, } } -static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, - int sb_offset) { +static uint64_t avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, + int sb_offset) { unsigned int tmp_sse; uint64_t tmp_sad; unsigned int tmp_variance; @@ -1002,7 +1173,7 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, uint64_t avg_source_sad_threshold = 10000; uint64_t avg_source_sad_threshold2 = 12000; #if CONFIG_VP9_HIGHBITDEPTH - if (cpi->common.use_highbitdepth) return; + if (cpi->common.use_highbitdepth) return 0; #endif src_y += shift; last_src_y += shift; @@ -1019,8 +1190,12 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, : kHighSadHighSumdiff; // Detect large lighting change. - if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000) + if (cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->oxcf.rc_mode == VPX_CBR && tmp_variance < (tmp_sse >> 3) && + (tmp_sse - tmp_variance) > 10000) x->content_state_sb = kLowVarHighSumdiff; + else if (tmp_sad > (avg_source_sad_threshold << 1)) + x->content_state_sb = kVeryHighSad; if (cpi->content_state_sb_fd != NULL) { if (tmp_sad < avg_source_sad_threshold2) { @@ -1031,7 +1206,7 @@ static void avg_source_sad(VP9_COMP *cpi, MACROBLOCK *x, int shift, cpi->content_state_sb_fd[sb_offset] = 0; } } - return; + return tmp_sad; } // This function chooses partitioning based on the variance between source and @@ -1042,7 +1217,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, MACROBLOCKD *xd = &x->e_mbd; int i, j, k, m; v64x64 vt; - v16x16 vt2[16]; + v16x16 *vt2 = NULL; int force_split[21]; int avg_32x32; int max_var_32x32 = 0; @@ -1058,6 +1233,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const uint8_t *d; int sp; int dp; + int compute_minmax_variance = 1; unsigned int y_sad = UINT_MAX; BLOCK_SIZE bsize = BLOCK_64X64; // Ref frame used in partitioning. 
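
On the lighting-change test above (now limited to CBR, non-screen content): tmp_sse is the raw sum of squared frame differences while tmp_variance has the mean removed, so a large gap between the two means the difference is mostly a uniform brightness offset rather than motion. A numeric sketch with hypothetical values:

#include <stdio.h>

int main(void) {
  unsigned int tmp_sse = 160000;     /* hypothetical SB diff energy */
  unsigned int tmp_variance = 12000; /* same diff with mean removed */
  if (tmp_variance < (tmp_sse >> 3) && (tmp_sse - tmp_variance) > 10000)
    printf("kLowVarHighSumdiff: likely lighting change\n");
  return 0;
}
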
@@ -1082,6 +1258,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); segment_id = xd->mi[0]->segment_id; + if (cpi->oxcf.speed >= 8 || (cpi->use_svc && cpi->svc.non_reference_frame)) + compute_minmax_variance = 0; + + memset(x->variance_low, 0, sizeof(x->variance_low)); + if (cpi->sf.use_source_sad && !is_key_frame) { int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); content_state = x->content_state_sb; @@ -1092,9 +1273,27 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, x->lowvar_highsumdiff = (content_state == kLowVarHighSumdiff) ? 1 : 0; if (cpi->content_state_sb_fd != NULL) x->last_sb_high_content = cpi->content_state_sb_fd[sb_offset2]; + + // For SVC on top spatial layer: use/scale the partition from + // the lower spatial resolution if svc_use_lowres_part is enabled. + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1 && + cpi->svc.prev_partition_svc != NULL && content_state != kVeryHighSad) { + if (!scale_partitioning_svc(cpi, x, xd, BLOCK_64X64, mi_row >> 1, + mi_col >> 1, mi_row, mi_col)) { + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } + return 0; + } + } // If source_sad is low copy the partition without computing the y_sad. if (x->skip_low_source_sad && cpi->sf.copy_partition_flag && copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { + x->sb_use_mv_part = 1; + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); return 0; } } @@ -1110,8 +1309,6 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // For non keyframes, disable 4x4 average for low resolution when speed = 8 threshold_4x4avg = (cpi->oxcf.speed < 8) ? 
thresholds[1] << 1 : INT64_MAX; - memset(x->variance_low, 0, sizeof(x->variance_low)); - if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); if (xd->mb_to_bottom_edge < 0) pixels_high += (xd->mb_to_bottom_edge >> 3); @@ -1171,12 +1368,17 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; - if (cpi->oxcf.speed >= 8 && !low_res) + if (cpi->oxcf.speed >= 8 && !low_res && + x->content_state_sb != kVeryHighSad) { y_sad = cpi->fn_ptr[bsize].sdf( x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); - else + } else { y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + x->sb_use_mv_part = 1; + x->sb_mvcol_part = mi->mv[0].as_mv.col; + x->sb_mvrow_part = mi->mv[0].as_mv.row; + } y_sad_last = y_sad; // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad @@ -1197,7 +1399,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); vp9_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64); - x->sb_is_skin = skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); + if (cpi->use_skin_detection) + x->sb_is_skin = + skin_sb_split(cpi, x, low_res, mi_row, mi_col, force_split); d = xd->plane[0].dst.buf; dp = xd->plane[0].dst.stride; @@ -1212,6 +1416,12 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64); x->variance_low[0] = 1; chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); + } return 0; } } @@ -1223,6 +1433,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.copy_partition_flag && y_sad_last < cpi->vbp_threshold_copy && copy_partitioning(cpi, x, xd, mi_row, mi_col, segment_id, sb_offset)) { chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); return 0; } } else { @@ -1240,6 +1453,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, #endif // CONFIG_VP9_HIGHBITDEPTH } + if (low_res && threshold_4x4avg < INT64_MAX) + CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. 
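
Before the tree fill below, recall the identity the var nodes rely on: with N = 1 << log2_count samples, the sum of squared deviations is s2 - s*s/N, which is why uint32_t now suffices for sum_square_error (see the struct comment earlier in this diff). A worked sketch with hypothetical sums:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  uint32_t s2 = 5000; /* sum of squared errors */
  int32_t s = 80;     /* sum of errors */
  int log2_count = 6; /* 64 samples, e.g. one 8x8 node */
  uint32_t dev = s2 - (uint32_t)(((int64_t)s * s) >> log2_count);
  printf("squared deviations = %u, per-sample variance ~ %u\n", dev,
         dev >> log2_count); /* 4900 and 76 */
  return 0;
}
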
for (i = 0; i < 4; i++) { @@ -1276,7 +1491,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; - } else if (cpi->oxcf.speed < 8 && + } else if (compute_minmax_variance && vt.split[i].split[j].part_variances.none.variance > thresholds[1] && !cyclic_refresh_segment_id_boosted(segment_id)) { @@ -1288,7 +1503,10 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, xd->cur_buf->flags, #endif pixels_wide, pixels_high); - if (minmax > cpi->vbp_threshold_minmax) { + int thresh_minmax = (int)cpi->vbp_threshold_minmax; + if (x->content_state_sb == kVeryHighSad) + thresh_minmax = thresh_minmax << 1; + if (minmax > thresh_minmax) { force_split[split_index] = 1; force_split[i + 1] = 1; force_split[0] = 1; @@ -1431,21 +1649,20 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { - update_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col); - cpi->prev_segment_id[sb_offset] = segment_id; - memcpy(&(cpi->prev_variance_low[sb_offset * 25]), x->variance_low, - sizeof(x->variance_low)); - // Reset the counter for copy partitioning - if (cpi->copied_frame_cnt[sb_offset] == cpi->max_copied_frame) - cpi->copied_frame_cnt[sb_offset] = 0; + update_prev_partition(cpi, x, segment_id, mi_row, mi_col, sb_offset); } + if (cm->frame_type != KEY_FRAME && cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) + update_partition_svc(cpi, BLOCK_64X64, mi_row, mi_col); + if (cpi->sf.short_circuit_low_temp_var) { set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition, mi_col, mi_row); } chroma_check(cpi, x, bsize, y_sad, is_key_frame); + if (vt2) vpx_free(vt2); return 0; } @@ -3480,7 +3697,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) { static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x, RD_COST *rd_cost, BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx) { - if (bsize < BLOCK_16X16) + if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16) vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX); else vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx); @@ -3644,6 +3861,9 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, !force_horz_split && xss <= yss && bsize >= BLOCK_8X8; (void)*tp_orig; + // Avoid checking for rectangular partitions for speed >= 6. + if (cpi->oxcf.speed >= 6) do_rect = 0; + assert(num_8x8_blocks_wide_lookup[bsize] == num_8x8_blocks_high_lookup[bsize]); @@ -3871,6 +4091,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, PARTITION_TYPE partition; BLOCK_SIZE subsize; RD_COST this_rdc; + BLOCK_SIZE subsize_ref = + (cpi->sf.adapt_partition_source_sad) ? 
BLOCK_8X8 : BLOCK_16X16; vp9_rd_cost_reset(&this_rdc); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -3884,7 +4106,7 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, 0, INT64_MAX, pc_tree); } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE && - subsize >= BLOCK_16X16) { + subsize >= subsize_ref) { x->max_partition_size = BLOCK_32X32; x->min_partition_size = BLOCK_8X8; nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize, rd_cost, @@ -4132,6 +4354,10 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile); + if (cpi->use_skin_detection) { + vp9_compute_skin_sb(cpi, BLOCK_16X16, mi_row, mi_col); + } + x->source_variance = UINT_MAX; vp9_zero(x->pred_mv); vp9_rd_cost_init(&dummy_rdc); @@ -4141,6 +4367,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, x->skip_low_source_sad = 0; x->lowvar_highsumdiff = 0; x->content_state_sb = 0; + x->sb_use_mv_part = 0; + x->sb_mvcol_part = 0; + x->sb_mvrow_part = 0; + x->sb_pickmode_part = 0; + x->arf_frame_usage = 0; + x->lastgolden_frame_usage = 0; if (seg->enabled) { const uint8_t *const map = @@ -4155,7 +4387,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) { int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3); int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); - avg_source_sad(cpi, x, shift, sb_offset2); + int64_t source_sad = avg_source_sad(cpi, x, shift, sb_offset2); + if (sf->adapt_partition_source_sad && + (cpi->oxcf.rc_mode == VPX_VBR && !cpi->rc.is_src_frame_alt_ref && + source_sad > sf->adapt_partition_thresh && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) + partition_search_type = REFERENCE_PARTITION; } // Set the partition type of the 64X64 block @@ -4181,12 +4418,14 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, BLOCK_64X64, 1, &dummy_rdc, td->pc_root); break; case REFERENCE_PARTITION: + x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); // Use nonrd_pick_partition on scene-cut for VBR mode. // nonrd_pick_partition does not support 4x4 partition, so avoid it // on key frame for now. if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && - cm->frame_type != KEY_FRAME)) { + cpi->oxcf.speed < 6 && cm->frame_type != KEY_FRAME && + (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { // Use lower max_partition_size for low resoultions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; @@ -4213,12 +4452,34 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, default: assert(0); break; } + // Update ref_frame usage for inter frame if this group is ARF group. 
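
The sboffset computed just below indexes one counter per 64x64 superblock: mi units are 8x8 pixels, so >> 3 converts mi coordinates to SB coordinates, and (mi_cols + 7) >> 3 is the rounded-up SB stride. A sketch with a hypothetical 1280x720 frame:

#include <stdio.h>

int main(void) {
  int mi_cols = 160;                  /* 1280 / 8 */
  int mi_row = 24, mi_col = 40;       /* some superblock origin */
  int sb_stride = (mi_cols + 7) >> 3; /* 20 SBs per row */
  int sboffset = sb_stride * (mi_row >> 3) + (mi_col >> 3);
  printf("sboffset = %d\n", sboffset); /* 20 * 3 + 5 = 65 */
  return 0;
}
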
+ if (!cpi->rc.is_src_frame_alt_ref && !cpi->refresh_golden_frame && + !cpi->refresh_alt_ref_frame && cpi->rc.alt_ref_gf_group && + cpi->sf.use_altref_onepass) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + if (cpi->count_arf_frame_usage != NULL) + cpi->count_arf_frame_usage[sboffset] = x->arf_frame_usage; + if (cpi->count_lastgolden_frame_usage != NULL) + cpi->count_lastgolden_frame_usage[sboffset] = x->lastgolden_frame_usage; + } + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); } } // end RTC play code +static INLINE uint32_t variance(const diff *const d) { + return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE uint32_t variance_highbd(diff *const d) { + const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); + return (var >= 0) ? (uint32_t)var : 0; +} +#endif // CONFIG_VP9_HIGHBITDEPTH + static int set_var_thresh_from_histogram(VP9_COMP *cpi) { const SPEED_FEATURES *const sf = &cpi->sf; const VP9_COMMON *const cm = &cpi->common; @@ -4248,14 +4509,17 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { case VPX_BITS_8: vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); break; case VPX_BITS_10: vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); break; case VPX_BITS_12: vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance_highbd(var16); break; default: assert(0 && @@ -4266,12 +4530,13 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); } #else vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); + var16->var = variance(var16); #endif // CONFIG_VP9_HIGHBITDEPTH - var16->var = var16->sse - (((uint32_t)var16->sum * var16->sum) >> 8); if (var16->var >= VAR_HIST_MAX_BG_VAR) hist[VAR_HIST_BINS - 1]++; @@ -4482,15 +4747,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) - x->fwd_txm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; - x->highbd_itxm_add = + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->highbd_inv_txfm_add = xd->lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - x->fwd_txm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; + x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + x->inv_txfm_add = xd->lossless ? 
vp9_iwht4x4_add : vp9_idct4x4_add; if (xd->lossless) x->optimize = 0; @@ -4733,8 +4998,31 @@ void vp9_encode_frame(VP9_COMP *cpi) { } } } else { + FRAME_COUNTS *counts = cpi->td.counts; cm->reference_mode = SINGLE_REFERENCE; + if (cpi->allow_comp_inter_inter && cpi->sf.use_compound_nonrd_pickmode && + cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + cm->frame_type != KEY_FRAME) + cm->reference_mode = REFERENCE_MODE_SELECT; + encode_frame_internal(cpi); + + if (cm->reference_mode == REFERENCE_MODE_SELECT) { + int single_count_zero = 0; + int comp_count_zero = 0; + int i; + for (i = 0; i < COMP_INTER_CONTEXTS; i++) { + single_count_zero += counts->comp_inter[i][0]; + comp_count_zero += counts->comp_inter[i][1]; + } + if (comp_count_zero == 0) { + cm->reference_mode = SINGLE_REFERENCE; + vp9_zero(counts->comp_inter); + } else if (single_count_zero == 0) { + cm->reference_mode = COMPOUND_REFERENCE; + vp9_zero(counts->comp_inter); + } + } } // If segmented AQ is enabled compute the average AQ weighting. diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c index 7e30499c5..f3c17f255 100644 --- a/libvpx/vp9/encoder/vp9_encodemb.c +++ b/libvpx/vp9/encoder/vp9_encodemb.c @@ -49,283 +49,258 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { pd->dst.buf, pd->dst.stride); } -typedef struct vp9_token_state { - int64_t error; - int rate; - int16_t next; - int16_t token; - tran_low_t qc; - tran_low_t dqc; - uint8_t best_index; -} vp9_token_state; - static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = { { 10, 6 }, { 8, 5 }, }; -#define UPDATE_RD_COST() \ - { \ - rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0); \ - rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1); \ - } - -// This function is a place holder for now but may ultimately need -// to scan previous tokens to work out the correct context. -static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb, - int idx, int token, uint8_t *token_cache) { - int bak = token_cache[scan[idx]], pt; - token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = get_coef_context(nb, token_cache, idx + 1); - token_cache[scan[idx]] = bak; - return pt; -} +// 'num' can be negative, but 'shift' must be non-negative. +#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \ + ((num) >= 0) ? 
(num) >> (shift) : -((-(num)) >> (shift)) -static const int16_t band_count_table[TX_SIZES][8] = { - { 1, 2, 3, 4, 3, 16 - 13, 0 }, - { 1, 2, 3, 4, 11, 64 - 21, 0 }, - { 1, 2, 3, 4, 11, 256 - 21, 0 }, - { 1, 2, 3, 4, 11, 1024 - 21, 0 }, -}; -static const int16_t band_cum_count_table[TX_SIZES][8] = { - { 0, 1, 3, 6, 10, 13, 16, 0 }, - { 0, 1, 3, 6, 10, 21, 64, 0 }, - { 0, 1, 3, 6, 10, 21, 256, 0 }, - { 0, 1, 3, 6, 10, 21, 1024, 0 }, -}; int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int ctx) { MACROBLOCKD *const xd = &mb->e_mbd; struct macroblock_plane *const p = &mb->plane[plane]; struct macroblockd_plane *const pd = &xd->plane[plane]; const int ref = is_inter_block(xd->mi[0]); - vp9_token_state tokens[1025][2]; uint8_t token_cache[1024]; - const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block); + const tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); const int eob = p->eobs[block]; - const PLANE_TYPE type = get_plane_type(plane); + const PLANE_TYPE plane_type = get_plane_type(plane); const int default_eob = 16 << (tx_size << 1); const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, type, block); + const scan_order *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; - const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift }; - int next = eob, sz = 0; - const int64_t rdmult = ((int64_t)mb->rdmult * plane_rd_mult[ref][type]) >> 1; + const int64_t rdmult = + ((int64_t)mb->rdmult * plane_rd_mult[ref][plane_type]) >> 1; const int64_t rddiv = mb->rddiv; int64_t rd_cost0, rd_cost1; - int rate0, rate1; - int64_t error0, error1; + int64_t rate0, rate1; int16_t t0, t1; - int best, band = (eob < default_eob) ? band_translate[eob] - : band_translate[eob - 1]; - int pt, i, final_eob; + int i, final_eob; #if CONFIG_VP9_HIGHBITDEPTH const uint16_t *cat6_high_cost = vp9_get_high_cost_table(xd->bd); #else const uint16_t *cat6_high_cost = vp9_get_high_cost_table(8); #endif - unsigned int(*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = - mb->token_costs[tx_size][type][ref]; - const int16_t *band_counts = &band_count_table[tx_size][band]; - int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1; - - token_costs += band; - - assert((!type && !plane) || (type && plane)); + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][plane_type][ref]; + unsigned int(*token_costs_cur)[2][COEFF_CONTEXTS][ENTROPY_TOKENS]; + int64_t eob_cost0, eob_cost1; + const int ctx0 = ctx; + int64_t accu_rate = 0; + // Initialized to the worst possible error for the largest transform size. + // This ensures that it never goes negative. + int64_t accu_error = ((int64_t)1) << 50; + int64_t best_block_rd_cost = INT64_MAX; + int x_prev = 1; + tran_low_t before_best_eob_qc = 0; + tran_low_t before_best_eob_dqc = 0; + + assert((!plane_type && !plane) || (plane_type && plane)); assert(eob <= default_eob); - /* Now set up a Viterbi trellis to evaluate alternative roundings. */ - /* Initialize the sentinel node of the trellis. 
*/ - tokens[eob][0].rate = 0; - tokens[eob][0].error = 0; - tokens[eob][0].next = default_eob; - tokens[eob][0].token = EOB_TOKEN; - tokens[eob][0].qc = 0; - tokens[eob][1] = tokens[eob][0]; - for (i = 0; i < eob; i++) { const int rc = scan[i]; token_cache[rc] = vp9_pt_energy_class[vp9_get_token(qcoeff[rc])]; } + final_eob = 0; - for (i = eob; i-- > 0;) { - int base_bits, d2, dx; + // Initial RD cost. + token_costs_cur = token_costs + band_translate[0]; + rate0 = (*token_costs_cur)[0][ctx0][EOB_TOKEN]; + best_block_rd_cost = RDCOST(rdmult, rddiv, rate0, accu_error); + + // For each token, pick one of two choices greedily: + // (i) First candidate: Keep current quantized value, OR + // (ii) Second candidate: Reduce quantized value by 1. + for (i = 0; i < eob; i++) { const int rc = scan[i]; - int x = qcoeff[rc]; - /* Only add a trellis state for non-zero coefficients. */ - if (x) { - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - /* Evaluate the first possibility for this state. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost); - /* Consider both possible successor states. */ - if (next < default_eob) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += (*token_costs)[0][pt][tokens[next][0].token]; - rate1 += (*token_costs)[0][pt][tokens[next][1].token]; - } - UPDATE_RD_COST(); - /* And pick the best. */ - best = rd_cost1 < rd_cost0; - dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int x = qcoeff[rc]; + const int band_cur = band_translate[i]; + const int ctx_cur = (i == 0) ? ctx : get_coef_context(nb, token_cache, i); + const int token_tree_sel_cur = (x_prev == 0); + token_costs_cur = token_costs + band_cur; + if (x == 0) { // No need to search + const int token = vp9_get_token(x); + rate0 = (*token_costs_cur)[token_tree_sel_cur][ctx_cur][token]; + accu_rate += rate0; + x_prev = 0; + // Note: accu_error does not change. + } else { + const int dqv = dequant_ptr[rc != 0]; + // Compute the distortion for quantizing to 0. + const int diff_for_zero_raw = (0 - coeff[rc]) * (1 << shift); + const int diff_for_zero = #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx >>= xd->bd - 8; - } + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff_for_zero_raw, xd->bd - 8) + : +#endif + diff_for_zero_raw; + const int64_t distortion_for_zero = + (int64_t)diff_for_zero * diff_for_zero; + + // Compute the distortion for the first candidate + const int diff0_raw = (dqcoeff[rc] - coeff[rc]) * (1 << shift); + const int diff0 = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + ? RIGHT_SHIFT_POSSIBLY_NEGATIVE(diff0_raw, xd->bd - 8) + : #endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - tokens[i][0].rate = base_bits + (best ? rate1 : rate0); - tokens[i][0].error = d2 + (best ? error1 : error0); - tokens[i][0].next = next; - tokens[i][0].token = t0; - tokens[i][0].qc = x; - tokens[i][0].dqc = dqcoeff[rc]; - tokens[i][0].best_index = best; - - /* Evaluate the second possibility for this state. 
*/ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - - if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && - (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) { - sz = -(x < 0); - x -= 2 * sz + 1; + diff0_raw; + const int64_t distortion0 = (int64_t)diff0 * diff0; + + // Compute the distortion for the second candidate + const int sign = -(x < 0); // -1 if x is negative and 0 otherwise. + const int x1 = x - 2 * sign - 1; // abs(x1) = abs(x) - 1. + int64_t distortion1; + if (x1 != 0) { + const int dqv_step = +#if CONFIG_VP9_HIGHBITDEPTH + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? dqv >> (xd->bd - 8) + : +#endif // CONFIG_VP9_HIGHBITDEPTH + dqv; + const int diff_step = (dqv_step + sign) ^ sign; + const int diff1 = diff0 - diff_step; + assert(dqv > 0); // We aren't right shifting a negative number above. + distortion1 = (int64_t)diff1 * diff1; } else { - tokens[i][1] = tokens[i][0]; - next = i; - - if (!(--band_left)) { - --band_counts; - band_left = *band_counts; - --token_costs; - } - continue; + distortion1 = distortion_for_zero; } - - /* Consider both possible successor states. */ - if (!x) { - /* If we reduced this coefficient to zero, check to see if - * we need to move the EOB back here. - */ - t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN; - base_bits = 0; - } else { - base_bits = vp9_get_token_cost(x, &t0, cat6_high_cost); - t1 = t0; + { + // Calculate RDCost for current coeff for the two candidates. + const int64_t base_bits0 = vp9_get_token_cost(x, &t0, cat6_high_cost); + const int64_t base_bits1 = vp9_get_token_cost(x1, &t1, cat6_high_cost); + rate0 = + base_bits0 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t0]; + rate1 = + base_bits1 + (*token_costs_cur)[token_tree_sel_cur][ctx_cur][t1]; } - if (next < default_eob) { - if (t0 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += (*token_costs)[!x][pt][tokens[next][0].token]; - } - if (t1 != EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += (*token_costs)[!x][pt][tokens[next][1].token]; + { + int rdcost_better_for_x1, eob_rdcost_better_for_x1; + int dqc0, dqc1; + int64_t best_eob_cost_cur; + int use_x1; + + // Calculate RD Cost effect on the next coeff for the two candidates. + int64_t next_bits0 = 0; + int64_t next_bits1 = 0; + int64_t next_eob_bits0 = 0; + int64_t next_eob_bits1 = 0; + if (i < default_eob - 1) { + int ctx_next, token_tree_sel_next; + const int band_next = band_translate[i + 1]; + const int token_next = + (i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN; + unsigned int( + *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + token_costs + band_next; + token_cache[rc] = vp9_pt_energy_class[t0]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x == 0); + next_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + next_eob_bits0 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + token_cache[rc] = vp9_pt_energy_class[t1]; + ctx_next = get_coef_context(nb, token_cache, i + 1); + token_tree_sel_next = (x1 == 0); + next_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][token_next]; + if (x1 != 0) { + next_eob_bits1 = + (*token_costs_next)[token_tree_sel_next][ctx_next][EOB_TOKEN]; + } } - } - UPDATE_RD_COST(); - /* And pick the best. 
*/ - best = rd_cost1 < rd_cost0; + // Compare the total RD costs for two candidates. + rd_cost0 = RDCOST(rdmult, rddiv, (rate0 + next_bits0), distortion0); + rd_cost1 = RDCOST(rdmult, rddiv, (rate1 + next_bits1), distortion1); + rdcost_better_for_x1 = (rd_cost1 < rd_cost0); + eob_cost0 = RDCOST(rdmult, rddiv, (accu_rate + rate0 + next_eob_bits0), + (accu_error + distortion0 - distortion_for_zero)); + eob_cost1 = eob_cost0; + if (x1 != 0) { + eob_cost1 = + RDCOST(rdmult, rddiv, (accu_rate + rate1 + next_eob_bits1), + (accu_error + distortion1 - distortion_for_zero)); + eob_rdcost_better_for_x1 = (eob_cost1 < eob_cost0); + } else { + eob_rdcost_better_for_x1 = 0; + } -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; - } -#else - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; - - tokens[i][1].rate = base_bits + (best ? rate1 : rate0); - tokens[i][1].error = d2 + (best ? error1 : error0); - tokens[i][1].next = next; - tokens[i][1].token = best ? t1 : t0; - tokens[i][1].qc = x; - - if (x) { - tran_low_t offset = dq_step[rc != 0]; - // The 32x32 transform coefficient uses half quantization step size. - // Account for the rounding difference in the dequantized coefficeint - // value when the quantization index is dropped from an even number - // to an odd number. - if (shift & x) offset += (dequant_ptr[rc != 0] & 0x01); - - if (sz == 0) - tokens[i][1].dqc = dqcoeff[rc] - offset; - else - tokens[i][1].dqc = dqcoeff[rc] + offset; - } else { - tokens[i][1].dqc = 0; - } + // Calculate the two candidate de-quantized values. + dqc0 = dqcoeff[rc]; + dqc1 = 0; + if (rdcost_better_for_x1 + eob_rdcost_better_for_x1) { + if (x1 != 0) { + dqc1 = RIGHT_SHIFT_POSSIBLY_NEGATIVE(x1 * dqv, shift); + } else { + dqc1 = 0; + } + } - tokens[i][1].best_index = best; - /* Finally, make this the new head of the trellis. */ - next = i; - } else { - /* There's no choice to make for a zero coefficient, so we don't - * add a new trellis node, but we do need to update the costs. - */ - pt = get_coef_context(nb, token_cache, i + 1); - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - /* Update the cost of each path if we're past the EOB token. */ - if (t0 != EOB_TOKEN) { - tokens[next][0].rate += (*token_costs)[1][pt][t0]; - tokens[next][0].token = ZERO_TOKEN; - } - if (t1 != EOB_TOKEN) { - tokens[next][1].rate += (*token_costs)[1][pt][t1]; - tokens[next][1].token = ZERO_TOKEN; + // Pick and record the better quantized and de-quantized values. + if (rdcost_better_for_x1) { + qcoeff[rc] = x1; + dqcoeff[rc] = dqc1; + accu_rate += rate1; + accu_error += distortion1 - distortion_for_zero; + assert(distortion1 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t1]; + } else { + accu_rate += rate0; + accu_error += distortion0 - distortion_for_zero; + assert(distortion0 <= distortion_for_zero); + token_cache[rc] = vp9_pt_energy_class[t0]; + } + assert(accu_error >= 0); + x_prev = qcoeff[rc]; // Update based on selected quantized value. + + use_x1 = (x1 != 0) && eob_rdcost_better_for_x1; + best_eob_cost_cur = use_x1 ? 
eob_cost1 : eob_cost0; + + // Determine whether to move the eob position to i+1 + if (best_eob_cost_cur < best_block_rd_cost) { + best_block_rd_cost = best_eob_cost_cur; + final_eob = i + 1; + if (use_x1) { + before_best_eob_qc = x1; + before_best_eob_dqc = dqc1; + } else { + before_best_eob_qc = x; + before_best_eob_dqc = dqc0; + } + } } - tokens[i][0].best_index = tokens[i][1].best_index = 0; - /* Don't update next, because we didn't add a new node. */ - } - - if (!(--band_left)) { - --band_counts; - band_left = *band_counts; - --token_costs; } } - - /* Now pick the best path through the whole trellis. */ - rate0 = tokens[next][0].rate; - rate1 = tokens[next][1].rate; - error0 = tokens[next][0].error; - error1 = tokens[next][1].error; - t0 = tokens[next][0].token; - t1 = tokens[next][1].token; - rate0 += (*token_costs)[0][ctx][t0]; - rate1 += (*token_costs)[0][ctx][t1]; - UPDATE_RD_COST(); - best = rd_cost1 < rd_cost0; - final_eob = -1; - - for (i = next; i < eob; i = next) { - const int x = tokens[i][best].qc; - const int rc = scan[i]; - if (x) final_eob = i; - qcoeff[rc] = x; - dqcoeff[rc] = tokens[i][best].dqc; - next = tokens[i][best].next; - best = tokens[i][best].best_index; + assert(final_eob <= eob); + if (final_eob > 0) { + int rc; + assert(before_best_eob_qc != 0); + i = final_eob - 1; + rc = scan[i]; + qcoeff[rc] = before_best_eob_qc; + dqcoeff[rc] = before_best_eob_dqc; + } + for (i = final_eob; i < eob; i++) { + int rc = scan[i]; + qcoeff[rc] = 0; + dqcoeff[rc] = 0; } - final_eob++; - mb->plane[plane].eobs[block] = final_eob; return final_eob; } +#undef RIGHT_SHIFT_POSSIBLY_NEGATIVE static INLINE void fdct32x32(int rd_transform, const int16_t *src, tran_low_t *dst, int src_stride) { @@ -358,6 +333,8 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -381,7 +358,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -411,7 +388,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, eob, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -432,6 +409,9 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. 
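
A note on RIGHT_SHIFT_POSSIBLY_NEGATIVE, defined and #undef'd around the new greedy vp9_optimize_b above: right-shifting a negative value is implementation-defined in C, and the macro instead negates before shifting, which also truncates toward zero like integer division. A small check (the -3 alternative assumes a two's-complement arithmetic shift):

#include <stdio.h>

#define RIGHT_SHIFT_POSSIBLY_NEGATIVE(num, shift) \
  (((num) >= 0) ? (num) >> (shift) : -((-(num)) >> (shift)))

int main(void) {
  /* A plain -5 >> 1 commonly yields -3 (arithmetic shift, rounding down);
   * the macro yields -2, matching truncating division by 2. */
  printf("%d\n", RIGHT_SHIFT_POSSIBLY_NEGATIVE(-5, 1)); /* -2 */
  printf("%d\n", RIGHT_SHIFT_POSSIBLY_NEGATIVE(5, 1));  /* 2 */
  return 0;
}
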
+ assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { switch (tx_size) { @@ -454,7 +434,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, eob); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); @@ -482,7 +462,7 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, qcoeff, dqcoeff, pd->dequant[0], eob); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; @@ -503,6 +483,8 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const int16_t *src_diff; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; + // skip block condition should be handled before this is called. + assert(!x->skip_block); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { @@ -529,7 +511,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, @@ -562,7 +544,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, scan_order->scan, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -655,8 +637,8 @@ static void encode_block(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->highbd_itxm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], - xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], + xd->bd); break; default: assert(0 && "Invalid transform size"); } @@ -678,7 +660,7 @@ static void encode_block(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. 
- x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; default: assert(0 && "Invalid transform size"); break; } @@ -700,12 +682,12 @@ static void encode_block_pass1(int plane, int block, int row, int col, if (p->eobs[block] > 0) { #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - x->highbd_itxm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, - p->eobs[block], xd->bd); + x->highbd_inv_txfm_add(dqcoeff, CONVERT_TO_SHORTPTR(dst), pd->dst.stride, + p->eobs[block], xd->bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH - x->itxm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); + x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); } } @@ -799,6 +781,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst, dst_stride, col, row, plane); + // skip block condition should be handled before this is called. + assert(!x->skip_block); + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); @@ -869,7 +854,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, @@ -883,7 +868,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around // eob<=1 which is significant (not just an optimization) for the // lossless case. - x->highbd_itxm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); + x->highbd_inv_txfm_add(dqcoeff, dst16, dst_stride, *eob, xd->bd); } else { vp9_highbd_iht4x4_16_add(dqcoeff, dst16, dst_stride, tx_type, xd->bd); @@ -951,7 +936,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -964,7 +949,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. - x->itxm_add(dqcoeff, dst, dst_stride, *eob); + x->inv_txfm_add(dqcoeff, dst, dst_stride, *eob); else vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c index f57f40dbe..2ae59dd98 100644 --- a/libvpx/vp9/encoder/vp9_encoder.c +++ b/libvpx/vp9/encoder/vp9_encoder.c @@ -71,7 +71,6 @@ // mv. Choose a very high value for // now so that HIGH_PRECISION is always // chosen. 
-// #define OUTPUT_YUV_REC #define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold #define FRAME_RATE_FACTOR 8 @@ -80,7 +79,7 @@ FILE *yuv_denoised_file = NULL; #endif #ifdef OUTPUT_YUV_SKINMAP -FILE *yuv_skinmap_file = NULL; +static FILE *yuv_skinmap_file = NULL; #endif #ifdef OUTPUT_YUV_REC FILE *yuv_rec_file; @@ -438,34 +437,37 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) { /* clang-format off */ const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { - { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, - { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, - { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, - { LEVEL_2_1, 9216000, 245760, 3600, 2800, 2, 2, 4, 8 }, - { LEVEL_3, 20736000, 552960, 7200, 6000, 2, 4, 4, 8 }, - { LEVEL_3_1, 36864000, 983040, 12000, 10000, 2, 4, 4, 8 }, - { LEVEL_4, 83558400, 2228224, 18000, 16000, 4, 4, 4, 8 }, - { LEVEL_4_1, 160432128, 2228224, 30000, 18000, 4, 4, 5, 6 }, - { LEVEL_5, 311951360, 8912896, 60000, 36000, 6, 8, 6, 4 }, - { LEVEL_5_1, 588251136, 8912896, 120000, 46000, 8, 8, 10, 4 }, + // sample rate size breadth bitrate cpb + { LEVEL_1, 829440, 36864, 512, 200, 400, 2, 1, 4, 8 }, + { LEVEL_1_1, 2764800, 73728, 768, 800, 1000, 2, 1, 4, 8 }, + { LEVEL_2, 4608000, 122880, 960, 1800, 1500, 2, 1, 4, 8 }, + { LEVEL_2_1, 9216000, 245760, 1344, 3600, 2800, 2, 2, 4, 8 }, + { LEVEL_3, 20736000, 552960, 2048, 7200, 6000, 2, 4, 4, 8 }, + { LEVEL_3_1, 36864000, 983040, 2752, 12000, 10000, 2, 4, 4, 8 }, + { LEVEL_4, 83558400, 2228224, 4160, 18000, 16000, 4, 4, 4, 8 }, + { LEVEL_4_1, 160432128, 2228224, 4160, 30000, 18000, 4, 4, 5, 6 }, + { LEVEL_5, 311951360, 8912896, 8384, 60000, 36000, 6, 8, 6, 4 }, + { LEVEL_5_1, 588251136, 8912896, 8384, 120000, 46000, 8, 8, 10, 4 }, // TODO(huisu): update max_cpb_size for level 5_2 ~ 6_2 when // they are finalized (currently tentative). - { LEVEL_5_2, 1176502272, 8912896, 180000, 90000, 8, 8, 10, 4 }, - { LEVEL_6, 1176502272, 35651584, 180000, 90000, 8, 16, 10, 4 }, - { LEVEL_6_1, 2353004544u, 35651584, 240000, 180000, 8, 16, 10, 4 }, - { LEVEL_6_2, 4706009088u, 35651584, 480000, 360000, 8, 16, 10, 4 }, + { LEVEL_5_2, 1176502272, 8912896, 8384, 180000, 90000, 8, 8, 10, 4 }, + { LEVEL_6, 1176502272, 35651584, 16832, 180000, 90000, 8, 16, 10, 4 }, + { LEVEL_6_1, 2353004544u, 35651584, 16832, 240000, 180000, 8, 16, 10, 4 }, + { LEVEL_6_2, 4706009088u, 35651584, 16832, 480000, 360000, 8, 16, 10, 4 }, }; /* clang-format on */ -static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = - { "The average bit-rate is too high.", - "The picture size is too large.", - "The luma sample rate is too large.", - "The CPB size is too large.", - "The compression ratio is too small", - "Too many column tiles are used.", - "The alt-ref distance is too small.", - "Too many reference buffers are used." }; +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { + "The average bit-rate is too high.", + "The picture size is too large.", + "The picture width/height is too large.", + "The luma sample rate is too large.", + "The CPB size is too large.", + "The compression ratio is too small", + "Too many column tiles are used.", + "The alt-ref distance is too small.", + "Too many reference buffers are used." 
+}; static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { switch (mode) { @@ -567,6 +569,8 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { (double)this_level->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) || level_spec->max_luma_picture_size > this_level->max_luma_picture_size || + level_spec->max_luma_picture_breadth > + this_level->max_luma_picture_breadth || level_spec->average_bitrate > this_level->average_bitrate || level_spec->max_cpb_size > this_level->max_cpb_size || level_spec->compression_ratio < this_level->compression_ratio || @@ -739,7 +743,9 @@ void vp9_initialize_enc(void) { vp9_init_me_luts(); vp9_rc_init_minq_luts(); vp9_entropy_mv_init(); +#if !CONFIG_REALTIME_ONLY vp9_temporal_filter_init(); +#endif init_done = 1; } } @@ -779,9 +785,15 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->nmvsadcosts_hp[0] = NULL; cpi->nmvsadcosts_hp[1] = NULL; + vpx_free(cpi->skin_map); + cpi->skin_map = NULL; + vpx_free(cpi->prev_partition); cpi->prev_partition = NULL; + vpx_free(cpi->svc.prev_partition_svc); + cpi->svc.prev_partition_svc = NULL; + vpx_free(cpi->prev_segment_id); cpi->prev_segment_id = NULL; @@ -794,6 +806,11 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { vpx_free(cpi->content_state_sb_fd); cpi->content_state_sb_fd = NULL; + vpx_free(cpi->count_arf_frame_usage); + cpi->count_arf_frame_usage = NULL; + vpx_free(cpi->count_lastgolden_frame_usage); + cpi->count_lastgolden_frame_usage = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; @@ -911,6 +928,7 @@ static void restore_coding_context(VP9_COMP *cpi) { *cm->fc = cc->fc; } +#if !CONFIG_REALTIME_ONLY static void configure_static_seg_features(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; @@ -1034,6 +1052,7 @@ static void configure_static_seg_features(VP9_COMP *cpi) { } } } +#endif // !CONFIG_REALTIME_ONLY static void update_reference_segmentation_map(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; @@ -1203,6 +1222,14 @@ static void set_tile_limits(VP9_COMP *cpi) { clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); cm->log2_tile_rows = cpi->oxcf.tile_rows; } + + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (cm->log2_tile_cols > level_tile_cols) { + cm->log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } } static void update_frame_size(VP9_COMP *cpi) { @@ -1318,14 +1345,12 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ +#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ @@ -1364,47 +1389,6 @@ static void set_rc_buffer_sizes(RATE_CONTROL *rc, 4; \ } -#define MAKE_BFP_SAD3_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, 
ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 3; i++) sad_array[i] >>= 4; \ - } - -#define MAKE_BFP_SAD8_WRAPPER(fnname) \ - static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - } \ - static void fnname##_bits10(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 2; \ - } \ - static void fnname##_bits12(const uint8_t *src_ptr, int source_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - unsigned int *sad_array) { \ - int i; \ - fnname(src_ptr, source_stride, ref_ptr, ref_stride, sad_array); \ - for (i = 0; i < 8; i++) sad_array[i] >>= 4; \ - } #define MAKE_BFP_SAD4D_WRAPPER(fnname) \ static void fnname##_bits8(const uint8_t *src_ptr, int source_stride, \ const uint8_t *const ref_ptr[], int ref_stride, \ @@ -1440,46 +1424,30 @@ MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) -MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) -MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3) 
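/*
 * [Editor's sketch] The deleted x3/x8 wrappers normalized bit depth exactly
 * like the surviving MAKE_BFP_SAD_WRAPPER / MAKE_BFP_SAD4D_WRAPPER pair: a
 * SAD over 10-bit samples can be 4x the 8-bit range and a 12-bit SAD 16x,
 * so the _bits10/_bits12 variants shift results by 2 and 4. Hand-expanded,
 * with the real vpx_highbd_sad16x16 primitive:
 *
 *   static unsigned int sad16x16_bits10(const uint8_t *src_ptr,
 *                                       int source_stride,
 *                                       const uint8_t *ref_ptr,
 *                                       int ref_stride) {
 *     // scale the 10-bit SAD back to the 8-bit range
 *     return vpx_highbd_sad16x16(src_ptr, source_stride, ref_ptr,
 *                                ref_stride) >> 2;
 *   }
 */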
-MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { @@ -1490,253 +1458,236 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits8) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits8) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits8) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_8_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits8) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8, vpx_highbd_sad32x32x4d_bits8) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8, vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8, vpx_highbd_sad64x64x4d_bits8) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8, vpx_highbd_sad16x16x4d_bits8) - HIGHBD_BFP( - BLOCK_16X8, vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8, - vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, vpx_highbd_sad16x8x3_bits8, - vpx_highbd_sad16x8x8_bits8, vpx_highbd_sad16x8x4d_bits8) + HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, + vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8) - HIGHBD_BFP( - BLOCK_8X16, vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8, - vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, vpx_highbd_sad8x16x3_bits8, - vpx_highbd_sad8x16x8_bits8, vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, + vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8) HIGHBD_BFP( BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits8, - vpx_highbd_sad8x8x8_bits8, vpx_highbd_sad8x8x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) - HIGHBD_BFP(BLOCK_8X4, 
vpx_highbd_sad8x4_bits8, - vpx_highbd_sad8x4_avg_bits8, vpx_highbd_8_variance8x4, - vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8) + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, - vpx_highbd_sad4x8_avg_bits8, vpx_highbd_8_variance4x8, - vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8) + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) HIGHBD_BFP( BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits8, - vpx_highbd_sad4x4x8_bits8, vpx_highbd_sad4x4x4d_bits8) + vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) break; case VPX_BITS_10: HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits10) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits10) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits10) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_10_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits10) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10, vpx_highbd_sad32x32x4d_bits10) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10, vpx_highbd_sad64x64x4d_bits10) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10, vpx_highbd_sad16x16x4d_bits10) HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10, vpx_highbd_sad16x8x4d_bits10) 
HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10, vpx_highbd_sad8x16x4d_bits10) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10, - vpx_highbd_10_variance8x8, vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits10, - vpx_highbd_sad8x8x8_bits10, vpx_highbd_sad8x8x4d_bits10) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10) + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10) - - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10, - vpx_highbd_10_variance4x4, vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits10, - vpx_highbd_sad4x4x8_bits10, vpx_highbd_sad4x4x4d_bits10) + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10) break; case VPX_BITS_12: HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance32x16, vpx_highbd_sad32x16x4d_bits12) HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance16x32, vpx_highbd_sad16x32x4d_bits12) HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance64x32, vpx_highbd_sad64x32x4d_bits12) HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_highbd_12_sub_pixel_avg_variance32x64, vpx_highbd_sad32x64x4d_bits12) HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12, vpx_highbd_sad32x32x4d_bits12) HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, 
vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12, vpx_highbd_sad64x64x4d_bits12) HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, vpx_highbd_12_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12, vpx_highbd_sad16x16x4d_bits12) HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12, vpx_highbd_sad16x8x4d_bits12) HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12, vpx_highbd_sad8x16x4d_bits12) - HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12, - vpx_highbd_12_variance8x8, vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x3_bits12, - vpx_highbd_sad8x8x8_bits12, vpx_highbd_sad8x8x4d_bits12) + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12) HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, NULL, - vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12) + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12) HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, NULL, - vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12) - - HIGHBD_BFP( - BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12, - vpx_highbd_12_variance4x4, vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x3_bits12, - vpx_highbd_sad4x4x8_bits12, vpx_highbd_sad4x4x4d_bits12) + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12) break; default: @@ -1902,6 +1853,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cm->mi_rows * cm->mi_cols * sizeof(*cpi->consec_zero_mv)); if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_reset_resize(cpi); + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; } if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) || @@ -1912,6 +1865,24 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { (int)cpi->oxcf.target_bandwidth); } + // Check for resetting the rc flags (rc_1_frame, rc_2_frame) if the + // configuration change has a large change in avg_frame_bandwidth. + // For SVC check for resetting based on spatial layer average bandwidth. + // Also reset buffer level to optimal level. 
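/*
 * [Editor's note] The rule that follows: a reconfiguration moving the
 * average frame bandwidth by more than half in either direction clears the
 * rc_1_frame/rc_2_frame history and snaps bits_off_target and buffer_level
 * back to optimal_buffer_level. With last_avg_frame_bandwidth = 10000 bits,
 * for example, a new average above 15000 (3/2x) or below 5000 (1/2x)
 * triggers the reset; SVC streams are handled per spatial layer via
 * vp9_svc_check_reset_layer_rc_flag() instead.
 */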
+ if (cm->current_video_frame > 0) { + if (cpi->use_svc) { + vp9_svc_check_reset_layer_rc_flag(cpi); + } else { + if (rc->avg_frame_bandwidth > (3 * rc->last_avg_frame_bandwidth >> 1) || + rc->avg_frame_bandwidth < (rc->last_avg_frame_bandwidth >> 1)) { + rc->rc_1_frame = 0; + rc->rc_2_frame = 0; + rc->bits_off_target = rc->optimal_buffer_level; + rc->buffer_level = rc->optimal_buffer_level; + } + } + } + cpi->alt_ref_source = NULL; rc->is_src_frame_alt_ref = 0; @@ -2046,6 +2017,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); + CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); CHECK_MEM_ERROR( @@ -2162,7 +2136,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, #endif #endif #ifdef OUTPUT_YUV_SKINMAP - yuv_skinmap_file = fopen("skinmap.yuv", "ab"); + yuv_skinmap_file = fopen("skinmap.yuv", "wb"); #endif #ifdef OUTPUT_YUV_REC yuv_rec_file = fopen("rec.yuv", "wb"); @@ -2175,6 +2149,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 1) { vp9_init_first_pass(cpi); } else if (oxcf->pass == 2) { @@ -2239,6 +2214,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, vp9_init_second_pass(cpi); } } +#endif // !CONFIG_REALTIME_ONLY vp9_set_speed_features_framesize_independent(cpi); vp9_set_speed_features_framesize_dependent(cpi); @@ -2248,67 +2224,61 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx3f = SDX3F; \ - cpi->fn_ptr[BT].sdx8f = SDX8F; \ +#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, NULL, NULL, + vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d) BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, NULL, NULL, + vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d) BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, NULL, NULL, + vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d) BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, NULL, NULL, + vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d) BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x3, vpx_sad32x32x8, vpx_sad32x32x4d) + vpx_sad32x32x4d) BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x3, vpx_sad64x64x8, vpx_sad64x64x4d) + vpx_sad64x64x4d) BFP(BLOCK_16X16, vpx_sad16x16, 
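/*
 * [Editor's sketch] With the sdx3f/sdx8f slots gone, motion search reaches
 * the table through the remaining hooks. A typical use, assuming local
 * src/ref buffers and a 4-entry ref4[] candidate array (these variable
 * names are illustrative only):
 *
 *   const vp9_variance_fn_ptr_t *fn = &cpi->fn_ptr[BLOCK_16X16];
 *   unsigned int sad = fn->sdf(src, src_stride, ref, ref_stride);
 *   unsigned int sads[4];
 *   fn->sdx4df(src, src_stride, ref4, ref_stride, sads);  // 4 candidates
 */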
vpx_sad16x16_avg, vpx_variance16x16, vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x3, vpx_sad16x16x8, vpx_sad16x16x4d) + vpx_sad16x16x4d) BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x3, - vpx_sad16x8x8, vpx_sad16x8x4d) + vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, + vpx_sad16x8x4d) BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x3, - vpx_sad8x16x8, vpx_sad8x16x4d) + vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, + vpx_sad8x16x4d) BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x3, - vpx_sad8x8x8, vpx_sad8x8x4d) + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, NULL, - vpx_sad8x4x8, vpx_sad8x4x4d) + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, NULL, - vpx_sad4x8x8, vpx_sad4x8x4d) + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x3, - vpx_sad4x4x8, vpx_sad4x4x4d) + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); @@ -2375,16 +2345,20 @@ void vp9_remove_compressor(VP9_COMP *cpi) { snprintf(headings, sizeof(headings), "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t" "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t" - "WstPsnr\tWstSsim\tWstFast\tWstHVS"); + "WstPsnr\tWstSsim\tWstFast\tWstHVS\t" + "AVPsnrY\tAPsnrCb\tAPsnrCr"); snprintf(results, sizeof(results), "%7.2f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t" "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" - "%7.3f\t%7.3f\t%7.3f\t%7.3f", + "%7.3f\t%7.3f\t%7.3f\t%7.3f\t" + "%7.3f\t%7.3f\t%7.3f", dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr, cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr, total_ssim, totalp_ssim, cpi->fastssim.stat[ALL] / cpi->count, cpi->psnrhvs.stat[ALL] / cpi->count, cpi->psnr.worst, - cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst); + cpi->worst_ssim, cpi->fastssim.worst, cpi->psnrhvs.worst, + cpi->psnr.stat[Y] / cpi->count, cpi->psnr.stat[U] / cpi->count, + cpi->psnr.stat[V] / cpi->count); if (cpi->b_calculate_blockiness) { SNPRINT(headings, "\t Block\tWstBlck"); @@ -2557,7 +2531,7 @@ int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(cfg, sd); + vpx_yv12_copy_frame(cfg, sd); return 0; } else { return -1; @@ -2568,7 +2542,7 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd) { YV12_BUFFER_CONFIG *cfg = get_vp9_ref_frame_buffer(cpi, ref_frame_flag); if (cfg) { - vp8_yv12_copy_frame(sd, cfg); + vpx_yv12_copy_frame(sd, cfg); return 0; } else { return -1; @@ -2581,38 +2555,6 @@ int vp9_update_entropy(VP9_COMP *cpi, int update) { return 0; } -#if defined(OUTPUT_YUV_DENOISED) || defined(OUTPUT_YUV_SKINMAP) -// The denoiser buffer is allocated as a YUV 440 buffer. This function writes it -// as YUV 420. 
We simply use the top-left pixels of the UV buffers, since we do -// not denoise the UV channels at this time. If ever we implement UV channel -// denoising we will have to modify this. -void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f) { - uint8_t *src = s->y_buffer; - int h = s->y_height; - - do { - fwrite(src, s->y_width, 1, f); - src += s->y_stride; - } while (--h); - - src = s->u_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); - - src = s->v_buffer; - h = s->uv_height; - - do { - fwrite(src, s->uv_width, 1, f); - src += s->uv_stride; - } while (--h); -} -#endif - #ifdef OUTPUT_YUV_REC void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { YV12_BUFFER_CONFIG *s = cm->frame_to_show; @@ -2748,15 +2690,14 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, if (src->flags & YV12_FLAG_HIGHBITDEPTH) { vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, - CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, - 16 / factor, 16 / factor, bd); + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor, + bd); } else { - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); } } } @@ -2782,11 +2723,33 @@ static int scale_down(VP9_COMP *cpi, int q) { return scale; } -static int big_rate_miss(VP9_COMP *cpi, int high_limit, int low_limit) { +static int big_rate_miss_high_threshold(VP9_COMP *cpi) { const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; - return (rc->projected_frame_size > ((high_limit * 3) / 2)) || - (rc->projected_frame_size < (low_limit / 2)); + if (frame_is_kf_gf_arf(cpi)) + big_miss_high = rc->this_frame_target * 3 / 2; + else + big_miss_high = rc->this_frame_target * 2; + + return big_miss_high; +} + +static int big_rate_miss(VP9_COMP *cpi) { + const RATE_CONTROL *const rc = &cpi->rc; + int big_miss_high; + int big_miss_low; + + // Ignore for overlay frames + if (rc->is_src_frame_alt_ref) { + return 0; + } else { + big_miss_low = (rc->this_frame_target / 2); + big_miss_high = big_rate_miss_high_threshold(cpi); + + return (rc->projected_frame_size > big_miss_high) || + (rc->projected_frame_size < big_miss_low); + } } // test in two pass for the first @@ -2811,8 +2774,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, int force_recode = 0; if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || - big_rate_miss(cpi, high_limit, low_limit) || - (cpi->sf.recode_loop == ALLOW_RECODE) || + big_rate_miss(cpi) || (cpi->sf.recode_loop == ALLOW_RECODE) || (two_pass_first_group_inter(cpi) && (cpi->sf.recode_loop == ALLOW_RECODE_FIRST)) || (frame_is_kfgfarf && (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF))) { @@ -2822,8 +2784,13 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, cpi->resize_pending = 1; return 1; } - // Force recode if projected_frame_size > max_frame_bandwidth - if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1; + + // Force recode for extreme overshoot. 
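/*
 * [Editor's note] Worked numbers for big_rate_miss() above: with
 * this_frame_target = 20000 bits, a key/golden/alt-ref frame is a "big
 * miss" outside [10000, 30000] (low = target/2, high = target*3/2), while
 * other frames get the wider [10000, 40000] window (high = target*2).
 * Overlay frames (is_src_frame_alt_ref) are exempt, and the hunk below
 * reuses the same high threshold to force a recode on extreme overshoot.
 */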
+ if ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF && + rc->projected_frame_size >= big_rate_miss_high_threshold(cpi))) { + return 1; + } // TODO(agrange) high_limit could be greater than the scale-down threshold. if ((rc->projected_frame_size > high_limit && q < maxq) || @@ -2914,17 +2881,38 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && cpi->denoiser.denoising_level > kDenLowLow) { int svc_base_is_key = 0; + int denoise_svc_second_layer = 0; if (cpi->use_svc) { + int realloc_fail = 0; + const int svc_buf_shift = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 + ? cpi->denoiser.num_ref_frames + : 0; int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id, cpi->svc.temporal_layer_id, cpi->svc.number_temporal_layers); LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; svc_base_is_key = lc->is_key_frame; + denoise_svc_second_layer = + cpi->svc.number_spatial_layers - cpi->svc.spatial_layer_id == 2 ? 1 + : 0; + // Check if we need to allocate extra buffers in the denoiser + // for + // refreshed frames. + realloc_fail = vp9_denoiser_realloc_svc( + cm, &cpi->denoiser, svc_buf_shift, cpi->refresh_alt_ref_frame, + cpi->refresh_golden_frame, cpi->refresh_last_frame, cpi->alt_fb_idx, + cpi->gld_fb_idx, cpi->lst_fb_idx); + if (realloc_fail) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to re-allocate denoiser for SVC"); } vp9_denoiser_update_frame_info( &cpi->denoiser, *cpi->Source, cpi->common.frame_type, cpi->refresh_alt_ref_frame, cpi->refresh_golden_frame, - cpi->refresh_last_frame, cpi->resize_pending, svc_base_is_key); + cpi->refresh_last_frame, cpi->alt_fb_idx, cpi->gld_fb_idx, + cpi->lst_fb_idx, cpi->resize_pending, svc_base_is_key, + denoise_svc_second_layer); } #endif if (is_one_pass_cbr_svc(cpi)) { @@ -3195,15 +3183,37 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { dc_quant_devisor = 4.0; #endif - fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" - "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " - "%10"PRId64" %10"PRId64" %10d " - "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" - "%6d %6d %5d %5d %5d " - "%10"PRId64" %10.3lf" - "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n", + if (!cm->current_video_frame) { + fprintf(f, "frame, width, height, last ts, last end ts, " + "source_alt_ref_pending, source_alt_ref_active, " + "this_frame_target, projected_frame_size, " + "projected_frame_size / MBs, " + "projected_frame_size - this_frame_target, " + "vbr_bits_off_target, vbr_bits_off_target_fast, " + "twopass.extend_minq, twopass.extend_minq_fast, " + "total_target_vs_actual, " + "starting_buffer_level - bits_off_target, " + "total_actual_bits, base_qindex, q for base_qindex, " + "dc quant, q for active_worst_quality, avg_q, q for oxcf.cq_level, " + "refresh_last_frame, refresh_golden_frame, refresh_alt_ref_frame, " + "frame_type, gfu_boost, " + "twopass.bits_left, " + "twopass.total_left_stats.coded_error, " + "twopass.bits_left / (1 + twopass.total_left_stats.coded_error), " + "tot_recode_hits, recon_err, kf_boost, " + "twopass.kf_zeromotion_pct, twopass.fr_content_type, " + "filter_level, seg.aq_av_offset\n"); + } + + fprintf(f, "%10u, %d, %d, %10"PRId64", %10"PRId64", %d, %d, %10d, %10d, " + "%10d, %10d, %10"PRId64", %10"PRId64", %5d, %5d, %10"PRId64", " + "%10"PRId64", %10"PRId64", %10d, %7.2lf, %7.2lf, %7.2lf, %7.2lf, " + "%7.2lf, %6d, %6d, %5d, %5d, %5d, %10"PRId64", %10.3lf, %10lf, %8u, " + "%10"PRId64", %10d, %10d, 
%10d, %10d, %10d\n", cpi->common.current_video_frame, cm->width, cm->height, + cpi->last_time_stamp_seen, + cpi->last_end_time_stamp_seen, cpi->rc.source_alt_ref_pending, cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, @@ -3291,7 +3301,6 @@ static void set_size_independent_vars(VP9_COMP *cpi) { static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, int *top_index) { VP9_COMMON *const cm = &cpi->common; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; // Setup variables that depend on the dimensions of the frame. vp9_set_speed_features_framesize_dependent(cpi); @@ -3303,17 +3312,19 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, vp9_set_high_precision_mv(cpi, (*q) < HIGH_PRECISION_MV_QTHRESH); } +#if !CONFIG_REALTIME_ONLY // Configure experimental use of segmentation for enhanced coding of // static regions if indicated. // Only allowed in the second pass of a two pass encode, as it requires // lagged coding, and if the relevant speed feature flag is set. - if (oxcf->pass == 2 && cpi->sf.static_segmentation) + if (cpi->oxcf.pass == 2 && cpi->sf.static_segmentation) configure_static_seg_features(cpi); +#endif // !CONFIG_REALTIME_ONLY #if CONFIG_VP9_POSTPROC && !(CONFIG_VP9_TEMPORAL_DENOISING) - if (oxcf->noise_sensitivity > 0) { + if (cpi->oxcf.noise_sensitivity > 0) { int l = 0; - switch (oxcf->noise_sensitivity) { + switch (cpi->oxcf.noise_sensitivity) { case 1: l = 20; break; case 2: l = 40; break; case 3: l = 60; break; @@ -3336,7 +3347,8 @@ static void setup_denoiser_buffer(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->oxcf.noise_sensitivity > 0 && !cpi->denoiser.frame_buffer_initialized) { - if (vp9_denoiser_alloc(&cpi->denoiser, cm->width, cm->height, + if (vp9_denoiser_alloc(cm, &cpi->svc, &cpi->denoiser, cpi->use_svc, + cpi->oxcf.noise_sensitivity, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, #if CONFIG_VP9_HIGHBITDEPTH cm->use_highbitdepth, @@ -3364,6 +3376,7 @@ static void set_frame_size(VP9_COMP *cpi) { VP9EncoderConfig *const oxcf = &cpi->oxcf; MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; +#if !CONFIG_REALTIME_ONLY if (oxcf->pass == 2 && oxcf->rc_mode == VPX_VBR && ((oxcf->resize_mode == RESIZE_FIXED && cm->current_video_frame == 0) || (oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending))) { @@ -3374,6 +3387,7 @@ static void set_frame_size(VP9_COMP *cpi) { vp9_set_size_literal(cpi, oxcf->scaled_frame_width, oxcf->scaled_frame_height); } +#endif // !CONFIG_REALTIME_ONLY if (oxcf->pass == 0 && oxcf->rc_mode == VPX_CBR && !cpi->use_svc && oxcf->resize_mode == RESIZE_DYNAMIC && cpi->resize_pending != 0) { @@ -3466,8 +3480,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, // Flag to check if its valid to compute the source sad (used for // scene detection and for superblock content state in CBR mode). // The flag may get reset below based on SVC or resizing state. 
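/*
 * [Editor's sketch] Two related changes land in the next hunks:
 * compute_source_sad_onepass is now set for every REALTIME frame (the
 * speed and show_frame conditions move into the consumer), and the
 * scene-detection gate becomes, in effect:
 *
 *   if (cpi->compute_source_sad_onepass && cm->show_frame &&
 *       (cpi->oxcf.rc_mode == VPX_VBR ||
 *        cpi->oxcf.content == VP9E_CONTENT_SCREEN ||
 *        (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc)))
 *     vp9_scene_detection_onepass(cpi);
 *
 * i.e. unconditional for VBR and screen content, newly enabled for other
 * rate modes at speeds 5-7, with speed 8 deferred pending an encode-time
 * cost check per the in-tree comment.
 */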
- cpi->compute_source_sad_onepass = - cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cm->show_frame; + cpi->compute_source_sad_onepass = cpi->oxcf.mode == REALTIME; vpx_clear_system_state(); @@ -3518,6 +3531,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if ((cpi->use_svc && (cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1 || + cpi->svc.temporal_layer_id < cpi->svc.number_temporal_layers - 1 || cpi->svc.current_superframe < 1)) || cpi->resize_pending || cpi->resize_state || cpi->external_resize || cpi->resize_state != ORIG) { @@ -3555,12 +3569,14 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vp9_update_noise_estimate(cpi); - // Scene detection is used for VBR mode or screen-content case. - // Make sure compute_source_sad_onepass is set (which handles SVC case - // and dynamic resize). - if (cpi->compute_source_sad_onepass && + // Scene detection is always used for VBR mode or screen-content case. + // For other cases (e.g., CBR mode) use it for 5 <= speed < 8 for now + // (need to check encoding time cost for doing this for speed 8). + cpi->rc.high_source_sad = 0; + if (cpi->compute_source_sad_onepass && cm->show_frame && (cpi->oxcf.rc_mode == VPX_VBR || - cpi->oxcf.content == VP9E_CONTENT_SCREEN)) + cpi->oxcf.content == VP9E_CONTENT_SCREEN || + (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8 && !cpi->use_svc))) vp9_scene_detection_onepass(cpi); // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame @@ -3576,6 +3592,16 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->sf.copy_partition_flag) alloc_copy_partition_data(cpi); + if (cpi->sf.svc_use_lowres_part && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 2) { + if (cpi->svc.prev_partition_svc == NULL) { + CHECK_MEM_ERROR( + cm, cpi->svc.prev_partition_svc, + (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, + sizeof(*cpi->svc.prev_partition_svc))); + } + } + if (cpi->oxcf.speed >= 5 && cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.content != VP9E_CONTENT_SCREEN && @@ -3660,6 +3686,7 @@ static int get_qstep_adj(int rate_excess, int rate_limit) { static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int bottom_index, top_index; @@ -3696,9 +3723,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, qrange_adj = VPXMAX(1, (top_index - bottom_index) / 2); bottom_index = - VPXMAX(bottom_index - qrange_adj / 2, cpi->oxcf.best_allowed_q); - top_index = - VPXMIN(cpi->oxcf.worst_allowed_q, top_index + qrange_adj / 2); + VPXMAX(bottom_index - qrange_adj / 2, oxcf->best_allowed_q); + top_index = VPXMIN(oxcf->worst_allowed_q, top_index + qrange_adj / 2); } #endif // TODO(agrange) Scale cpi->max_mv_magnitude if frame-size has changed. @@ -3726,7 +3752,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, &cpi->scaled_source, - (cpi->oxcf.pass == 0), EIGHTTAP, 0); + (oxcf->pass == 0), EIGHTTAP, 0); // Unfiltered raw source used in metrics calculation if the source // has been filtered. 
@@ -3735,7 +3761,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (is_spatial_denoise_enabled(cpi)) { cpi->raw_source_frame = vp9_scale_if_required( cm, &cpi->raw_unscaled_source, &cpi->raw_scaled_source, - (cpi->oxcf.pass == 0), EIGHTTAP, 0); + (oxcf->pass == 0), EIGHTTAP, 0); } else { cpi->raw_source_frame = cpi->Source; } @@ -3745,9 +3771,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, } if (cpi->unscaled_last_source != NULL) - cpi->Last_Source = vp9_scale_if_required( - cm, cpi->unscaled_last_source, &cpi->scaled_last_source, - (cpi->oxcf.pass == 0), EIGHTTAP, 0); + cpi->Last_Source = vp9_scale_if_required(cm, cpi->unscaled_last_source, + &cpi->scaled_last_source, + (oxcf->pass == 0), EIGHTTAP, 0); if (frame_is_intra_only(cm) == 0) { if (loop_count > 0) { @@ -3762,13 +3788,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. - if (cpi->oxcf.aq_mode == VARIANCE_AQ) { + if (oxcf->aq_mode == VARIANCE_AQ) { vp9_vaq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == EQUATOR360_AQ) { + } else if (oxcf->aq_mode == EQUATOR360_AQ) { vp9_360aq_frame_setup(cpi); - } else if (cpi->oxcf.aq_mode == COMPLEXITY_AQ) { + } else if (oxcf->aq_mode == COMPLEXITY_AQ) { vp9_setup_in_frame_q_adj(cpi); - } else if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) { + } else if (oxcf->aq_mode == LOOKAHEAD_AQ) { vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi); } @@ -3792,7 +3818,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } - if (cpi->oxcf.rc_mode == VPX_Q) { + if (oxcf->rc_mode == VPX_Q) { loop = 0; } else { if ((cm->frame_type == KEY_FRAME) && rc->this_key_frame_forced && @@ -3872,11 +3898,16 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // Frame is too large if (rc->projected_frame_size > rc->this_frame_target) { // Special case if the projected size is > the max allowed. - if (rc->projected_frame_size >= rc->max_frame_bandwidth) { + if ((q == q_high) && + ((rc->projected_frame_size >= rc->max_frame_bandwidth) || + (rc->projected_frame_size >= + big_rate_miss_high_threshold(cpi)))) { + int max_rate = VPXMAX(1, VPXMIN(rc->max_frame_bandwidth, + big_rate_miss_high_threshold(cpi))); double q_val_high; q_val_high = vp9_convert_qindex_to_q(q_high, cm->bit_depth); - q_val_high = q_val_high * ((double)rc->projected_frame_size / - rc->max_frame_bandwidth); + q_val_high = + q_val_high * ((double)rc->projected_frame_size / max_rate); q_high = vp9_convert_q_to_qindex(q_val_high, cm->bit_depth); q_high = clamp(q_high, rc->best_quality, rc->worst_quality); } @@ -3885,7 +3916,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, qstep = get_qstep_adj(rc->projected_frame_size, rc->this_frame_target); q_low = VPXMIN(q + qstep, q_high); - // q_low = q < q_high ? q + 1 : q_high; if (undershoot_seen || loop_at_this_size > 1) { // Update rate_correction_factor unless @@ -3913,31 +3943,29 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, qstep = get_qstep_adj(rc->this_frame_target, rc->projected_frame_size); q_high = VPXMAX(q - qstep, q_low); - // q_high = q > q_low ? 
q - 1 : q_low; if (overshoot_seen || loop_at_this_size > 1) { vp9_rc_update_rate_correction_factors(cpi); q = (q_high + q_low) / 2; } else { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); // Special case reset for qlow for constrained quality. // This should only trigger where there is very substantial // undershoot on a frame and the auto cq level is above // the user passsed in value. - if (cpi->oxcf.rc_mode == VPX_CQ && q < q_low) { + if (oxcf->rc_mode == VPX_CQ && q < q_low) { q_low = q; } while (q > q_high && retries < 10) { vp9_rc_update_rate_correction_factors(cpi); - q = vp9_rc_regulate_q(cpi, rc->this_frame_target, bottom_index, - top_index); + q = vp9_rc_regulate_q(cpi, rc->this_frame_target, + VPXMIN(q_low, bottom_index), top_index); retries++; } } - undershoot_seen = 1; } @@ -3971,9 +3999,21 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, #ifdef AGGRESSIVE_VBR if (two_pass_first_group_inter(cpi)) { cpi->twopass.active_worst_quality = - VPXMIN(q + qrange_adj, cpi->oxcf.worst_allowed_q); - } + VPXMIN(q + qrange_adj, oxcf->worst_allowed_q); + } else if (!frame_is_kf_gf_arf(cpi)) { +#else + if (!frame_is_kf_gf_arf(cpi)) { #endif + // Have we been forced to adapt Q outside the expected range by an extreme + // rate miss. If so adjust the active maxQ for the subsequent frames. + if (q > cpi->twopass.active_worst_quality) { + cpi->twopass.active_worst_quality = q; + } else if (oxcf->vbr_corpus_complexity && q == q_low && + rc->projected_frame_size < rc->this_frame_target) { + cpi->twopass.active_worst_quality = + VPXMAX(q, cpi->twopass.active_worst_quality - 1); + } + } if (enable_acl) { // Skip recoding, if model diff is below threshold @@ -4448,14 +4488,14 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, #if CONFIG_VP9_TEMPORAL_DENOISING #ifdef OUTPUT_YUV_DENOISED if (oxcf->noise_sensitivity > 0 && denoise_svc(cpi)) { - vp9_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME], - yuv_denoised_file); + vpx_write_yuv_frame(yuv_denoised_file, + &cpi->denoiser.running_avg_y[INTRA_FRAME]); } #endif #endif #ifdef OUTPUT_YUV_SKINMAP if (cpi->common.current_video_frame > 1) { - vp9_compute_skin_map(cpi, yuv_skinmap_file); + vp9_output_skin_map(cpi, yuv_skinmap_file); } #endif @@ -4592,6 +4632,7 @@ static void Pass0Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, encode_frame_to_data_rate(cpi, size, dest, frame_flags); } +#if !CONFIG_REALTIME_ONLY static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, unsigned int *frame_flags) { cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED; @@ -4600,6 +4641,7 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest, if (!(is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING)) vp9_twopass_postencode_update(cpi); } +#endif // !CONFIG_REALTIME_ONLY static void init_ref_frame_bufs(VP9_COMMON *cm) { int i; @@ -4822,6 +4864,7 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { int i, idx; uint64_t luma_samples, dur_end; const uint32_t luma_pic_size = cm->width * cm->height; + const uint32_t luma_pic_breadth = VPXMAX(cm->width, cm->height); LevelConstraint *const level_constraint = &cpi->level_constraint; const int8_t level_index = level_constraint->level_index; double cpb_data_size; @@ -4925,6 +4968,11 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, 
int arf_src_index) { level_spec->max_luma_picture_size = luma_pic_size; } + // update max_luma_picture_breadth + if (luma_pic_breadth > level_spec->max_luma_picture_breadth) { + level_spec->max_luma_picture_breadth = luma_pic_breadth; + } + // update compression_ratio level_spec->compression_ratio = (double)level_stats->total_uncompressed_size * cm->bit_depth / @@ -4945,6 +4993,15 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); } + if (level_spec->max_luma_picture_breadth > + vp9_level_defs[level_index].max_luma_picture_breadth) { + level_constraint->fail_flag |= (1 << LUMA_PIC_BREADTH_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_BREADTH_TOO_LARGE]); + } + if ((double)level_spec->max_luma_sample_rate > (double)vp9_level_defs[level_index].max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P)) { @@ -5094,7 +5151,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; #endif - +#if !CONFIG_REALTIME_ONLY if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { int bitrate = cpi->rc.avg_frame_bandwidth / 40; @@ -5114,7 +5171,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, force_src_buffer = &cpi->alt_ref_buffer; } - +#endif cm->show_frame = 0; cm->intra_only = 0; cpi->refresh_alt_ref_frame = 1; @@ -5145,8 +5202,6 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cm->intra_only = 0; // if the flags indicate intra frame, but if the current picture is for // non-zero spatial layer, it should not be an intra picture. - // TODO(Won Kap): this needs to change if per-layer intra frame is - // allowed. if ((source->flags & VPX_EFLAG_FORCE_KF) && cpi->svc.spatial_layer_id > cpi->svc.first_spatial_layer_to_encode) { source->flags &= ~(unsigned int)(VPX_EFLAG_FORCE_KF); @@ -5175,10 +5230,12 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else { *size = 0; +#if !CONFIG_REALTIME_ONLY if (flush && oxcf->pass == 1 && !cpi->twopass.first_pass_done) { vp9_end_first_pass(cpi); /* get last stats packet */ cpi->twopass.first_pass_done = 1; } +#endif // !CONFIG_REALTIME_ONLY return -1; } @@ -5225,6 +5282,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->frame_flags = *frame_flags; +#if !CONFIG_REALTIME_ONLY if ((oxcf->pass == 2) && (!cpi->use_svc || (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state != ENCODING))) { @@ -5232,6 +5290,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } else if (oxcf->pass == 1) { set_frame_size(cpi); } +#endif // !CONFIG_REALTIME_ONLY if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 && cpi->level_constraint.fail_flag == 0) @@ -5242,20 +5301,28 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } cpi->td.mb.fp_src_pred = 0; +#if CONFIG_REALTIME_ONLY + if (cpi->use_svc) { + SvcEncode(cpi, size, dest, frame_flags); + } else { + // One pass encode + Pass0Encode(cpi, size, dest, frame_flags); + } +#else // !CONFIG_REALTIME_ONLY if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH if (cpi->oxcf.use_highbitdepth) - cpi->td.mb.fwd_txm4x4 = + cpi->td.mb.fwd_txfm4x4 = lossless ? 
vp9_highbd_fwht4x4 : vpx_highbd_fdct4x4; else - cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; - cpi->td.mb.highbd_itxm_add = + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.highbd_inv_txfm_add = lossless ? vp9_highbd_iwht4x4_add : vp9_highbd_idct4x4_add; #else - cpi->td.mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; + cpi->td.mb.fwd_txfm4x4 = lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH - cpi->td.mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; + cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && (!cpi->use_svc || is_two_pass_svc(cpi))) { Pass2Encode(cpi, size, dest, frame_flags); @@ -5265,6 +5332,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, // One pass encode Pass0Encode(cpi, size, dest, frame_flags); } +#endif // CONFIG_REALTIME_ONLY if (cm->refresh_frame_context) cm->frame_contexts[cm->frame_context_idx] = *cm->fc; @@ -5631,7 +5699,7 @@ void vp9_set_row_mt(VP9_COMP *cpi) { cpi->row_mt = 1; } - if (cpi->row_mt && cpi->oxcf.max_threads > 1) + if (cpi->row_mt) cpi->row_mt_bit_exact = 1; else cpi->row_mt_bit_exact = 0; diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h index 672c83bfd..d723d93cb 100644 --- a/libvpx/vp9/encoder/vp9_encoder.h +++ b/libvpx/vp9/encoder/vp9_encoder.h @@ -138,6 +138,7 @@ typedef enum { kHighSadLowSumdiff = 3, kHighSadHighSumdiff = 4, kLowVarHighSumdiff = 5, + kVeryHighSad = 6, } CONTENT_STATE_SB; typedef struct VP9EncoderConfig { @@ -208,6 +209,7 @@ typedef struct VP9EncoderConfig { int two_pass_vbrbias; // two pass datarate control tweaks int two_pass_vbrmin_section; int two_pass_vbrmax_section; + int vbr_corpus_complexity; // 0 indicates corpus vbr disabled // END DATARATE CONTROL OPTIONS // ---------------------------------------------------------------- @@ -359,6 +361,7 @@ typedef struct IMAGE_STAT { typedef enum { LEVEL_UNKNOWN = 0, + LEVEL_AUTO = 1, LEVEL_1 = 10, LEVEL_1_1 = 11, LEVEL_2 = 20, @@ -380,6 +383,7 @@ typedef struct { VP9_LEVEL level; uint64_t max_luma_sample_rate; uint32_t max_luma_picture_size; + uint32_t max_luma_picture_breadth; double average_bitrate; // in kilobits per second double max_cpb_size; // in kilobits double compression_ratio; @@ -419,14 +423,15 @@ typedef struct { typedef enum { BITRATE_TOO_LARGE = 0, - LUMA_PIC_SIZE_TOO_LARGE = 1, - LUMA_SAMPLE_RATE_TOO_LARGE = 2, - CPB_TOO_LARGE = 3, - COMPRESSION_RATIO_TOO_SMALL = 4, - TOO_MANY_COLUMN_TILE = 5, - ALTREF_DIST_TOO_SMALL = 6, - TOO_MANY_REF_BUFFER = 7, - TARGET_LEVEL_FAIL_IDS = 8 + LUMA_PIC_SIZE_TOO_LARGE, + LUMA_PIC_BREADTH_TOO_LARGE, + LUMA_SAMPLE_RATE_TOO_LARGE, + CPB_TOO_LARGE, + COMPRESSION_RATIO_TOO_SMALL, + TOO_MANY_COLUMN_TILE, + ALTREF_DIST_TOO_SMALL, + TOO_MANY_REF_BUFFER, + TARGET_LEVEL_FAIL_IDS } TARGET_LEVEL_FAIL_ID; typedef struct { @@ -541,6 +546,8 @@ typedef struct VP9_COMP { uint8_t *segmentation_map; + uint8_t *skin_map; + // segment threashold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; @@ -548,7 +555,6 @@ typedef struct VP9_COMP { ActiveMap active_map; fractional_mv_step_fp *find_fractional_mv_step; - vp9_full_search_fn_t full_search_sad; vp9_diamond_search_fn_t diamond_search_sad; vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES]; uint64_t time_receive_data; @@ -714,6 +720,9 @@ typedef struct VP9_COMP { int compute_source_sad_onepass; LevelConstraint level_constraint; + + uint8_t *count_arf_frame_usage; + uint8_t 
*count_lastgolden_frame_usage; } VP9_COMP; void vp9_initialize_enc(void); @@ -861,13 +870,14 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { static INLINE int denoise_svc(const struct VP9_COMP *const cpi) { return (!cpi->use_svc || (cpi->use_svc && - cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); + cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise)); } #endif +#define MIN_LOOKAHEAD_FOR_ARFS 4 static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && - cpi->oxcf.lag_in_frames > 0 && + cpi->oxcf.lag_in_frames >= MIN_LOOKAHEAD_FOR_ARFS && (cpi->oxcf.enable_auto_arf && (!is_two_pass_svc(cpi) || cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); @@ -910,6 +920,22 @@ static INLINE int get_level_index(VP9_LEVEL level) { return -1; } +// Return the log2 value of max column tiles corresponding to the level that +// the picture size fits into. +static INLINE int log_tile_cols_from_picsize_level(uint32_t width, + uint32_t height) { + int i; + const uint32_t pic_size = width * height; + const uint32_t pic_breadth = VPXMAX(width, height); + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + return get_msb(vp9_level_defs[i].max_col_tiles); + } + } + return INT_MAX; +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); void vp9_new_framerate(VP9_COMP *cpi, double framerate); diff --git a/libvpx/vp9/encoder/vp9_ethread.c b/libvpx/vp9/encoder/vp9_ethread.c index 51664112a..0bd2e2145 100644 --- a/libvpx/vp9/encoder/vp9_ethread.c +++ b/libvpx/vp9/encoder/vp9_ethread.c @@ -35,7 +35,8 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { td_t->rd_counts.coef_counts[i][j][k][l][m][n]; } -static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { +static int enc_worker_hook(void *arg1, void *unused) { + EncWorkerData *const thread_data = (EncWorkerData *)arg1; VP9_COMP *const cpi = thread_data->cpi; const VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; @@ -64,6 +65,13 @@ static int get_max_tile_cols(VP9_COMP *cpi) { vp9_get_tile_n_bits(mi_cols, &min_log2_tile_cols, &max_log2_tile_cols); log2_tile_cols = clamp(cpi->oxcf.tile_columns, min_log2_tile_cols, max_log2_tile_cols); + if (cpi->oxcf.target_level == LEVEL_AUTO) { + const int level_tile_cols = + log_tile_cols_from_picsize_level(cpi->common.width, cpi->common.height); + if (log2_tile_cols > level_tile_cols) { + log2_tile_cols = VPXMAX(level_tile_cols, min_log2_tile_cols); + } + } return (1 << log2_tile_cols); } @@ -135,7 +143,7 @@ static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; - worker->hook = (VPxWorkerHook)hook; + worker->hook = hook; worker->data1 = &cpi->tile_thr_data[i]; worker->data2 = data2; } @@ -203,7 +211,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } } - launch_enc_workers(cpi, (VPxWorkerHook)enc_worker_hook, NULL, num_workers); + launch_enc_workers(cpi, enc_worker_hook, NULL, num_workers); for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; @@ -217,6 +225,7 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } } +#if !CONFIG_REALTIME_ONLY static void accumulate_fp_tile_stat(TileDataEnc *tile_data, TileDataEnc *tile_data_t) { tile_data->fp_data.intra_factor += 
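/*
 * [Editor's note] Worked example for log_tile_cols_from_picsize_level()
 * above, reading max_col_tiles from the vp9_level_defs rows earlier in
 * this diff: a 1920x1080 stream has pic_size 2073600 and breadth 1920;
 * LEVEL_3_1 (size 983040) is too small, so the first fit is LEVEL_4
 * (size 2228224, breadth 4160), whose max_col_tiles of 4 yields
 * get_msb(4) = 2, i.e. at most 1 << 2 = 4 tile columns. set_tile_limits()
 * and get_max_tile_cols() clamp to this value when target_level is the
 * new LEVEL_AUTO.
 */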
tile_data_t->fp_data.intra_factor; @@ -251,6 +260,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data, : VPXMIN(tile_data->fp_data.image_data_start_row, tile_data_t->fp_data.image_data_start_row); } +#endif // !CONFIG_REALTIME_ONLY // Allocate memory for row synchronization void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, @@ -379,6 +389,7 @@ void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, return; } +#if !CONFIG_REALTIME_ONLY static int first_pass_worker_hook(EncWorkerData *const thread_data, MultiThreadHandle *multi_thread_ctxt) { VP9_COMP *const cpi = thread_data->cpi; @@ -545,6 +556,7 @@ void vp9_temporal_filter_row_mt(VP9_COMP *cpi) { launch_enc_workers(cpi, (VPxWorkerHook)temporal_filter_worker_hook, multi_thread_ctxt, num_workers); } +#endif // !CONFIG_REALTIME_ONLY static int enc_row_mt_worker_hook(EncWorkerData *const thread_data, MultiThreadHandle *multi_thread_ctxt) { diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c index b6e327548..fb6b132a5 100644 --- a/libvpx/vp9/encoder/vp9_firstpass.c +++ b/libvpx/vp9/encoder/vp9_firstpass.c @@ -41,9 +41,9 @@ #define OUTPUT_FPF 0 #define ARF_STATS_OUTPUT 0 +#define COMPLEXITY_STATS_OUTPUT 0 #define FIRST_PASS_Q 10.0 -#define GF_MAX_BOOST 96.0 #define INTRA_MODE_PENALTY 1024 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 @@ -103,7 +103,7 @@ static void output_stats(FIRSTPASS_STATS *stats, fpfile = fopen("firstpass.stt", "a"); fprintf(fpfile, - "%12.0lf %12.4lf %12.0lf %12.0lf %12.0lf %12.0lf %12.4lf %12.4lf" + "%12.0lf %12.4lf %12.2lf %12.2lf %12.2lf %12.0lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.4lf" "%12.4lf %12.4lf %12.4lf %12.4lf %12.4lf %12.0lf %12.0lf %12.0lf" "%12.4lf" @@ -235,16 +235,25 @@ static double calculate_active_area(const VP9_COMP *cpi, return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA); } +// Get the average weighted error for the clip (or corpus) +static double get_distribution_av_err(VP9_COMP *cpi, TWO_PASS *const twopass) { + const double av_weight = + twopass->total_stats.weight / twopass->total_stats.count; + + if (cpi->oxcf.vbr_corpus_complexity) + return av_weight * twopass->mean_mod_score; + else + return (twopass->total_stats.coded_error * av_weight) / + twopass->total_stats.count; +} + +#define ACT_AREA_CORRECTION 0.5 // Calculate a modified Error used in distributing bits between easier and // harder frames. 
-#define ACT_AREA_CORRECTION 0.5 static double calculate_mod_frame_score(const VP9_COMP *cpi, - const TWO_PASS *twopass, const VP9EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &twopass->total_stats; - const double av_weight = stats->weight / stats->count; - const double av_err = (stats->coded_error * av_weight) / stats->count; + const FIRSTPASS_STATS *this_frame, + const double av_err) { double modified_score = av_err * pow(this_frame->coded_error * this_frame->weight / DOUBLE_DIVIDE_CHECK(av_err), @@ -260,13 +269,12 @@ static double calculate_mod_frame_score(const VP9_COMP *cpi, return modified_score; } + static double calculate_norm_frame_score(const VP9_COMP *cpi, const TWO_PASS *twopass, const VP9EncoderConfig *oxcf, - const FIRSTPASS_STATS *this_frame) { - const FIRSTPASS_STATS *const stats = &twopass->total_stats; - const double av_weight = stats->weight / stats->count; - const double av_err = (stats->coded_error * av_weight) / stats->count; + const FIRSTPASS_STATS *this_frame, + const double av_err) { double modified_score = av_err * pow(this_frame->coded_error * this_frame->weight / DOUBLE_DIVIDE_CHECK(av_err), @@ -723,8 +731,9 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, // Exclude any image dead zone if (fp_acc_data->image_data_start_row > 0) { fp_acc_data->intra_skip_count = - VPXMAX(0, fp_acc_data->intra_skip_count - - (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + VPXMAX(0, + fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); } fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; @@ -1583,6 +1592,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, const RATE_CONTROL *const rc = &cpi->rc; const VP9EncoderConfig *const oxcf = &cpi->oxcf; TWO_PASS *const twopass = &cpi->twopass; + double last_group_rate_err; // Clamp the target rate to VBR min / max limits. const int target_rate = @@ -1591,6 +1601,18 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, noise_factor = fclamp(noise_factor, NOISE_FACTOR_MIN, NOISE_FACTOR_MAX); inactive_zone = fclamp(inactive_zone, 0.0, 1.0); +// TODO(jimbankoski): remove #if here or below when this has been +// well tested. +#if CONFIG_ALWAYS_ADJUST_BPM + // based on recent history adjust expectations of bits per macroblock. + last_group_rate_err = + (double)twopass->rolling_arf_group_actual_bits / + DOUBLE_DIVIDE_CHECK((double)twopass->rolling_arf_group_target_bits); + last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); + twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; + twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif + if (target_rate <= 0) { return rc->worst_quality; // Highest value allowed } else { @@ -1601,11 +1623,13 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, const int active_mbs = (int)VPXMAX(1, (double)num_mbs * active_pct); const double av_err_per_mb = section_err / active_pct; const double speed_term = 1.0 + 0.04 * oxcf->speed; - double last_group_rate_err; const int target_norm_bits_per_mb = (int)(((uint64_t)target_rate << BPER_MB_NORMBITS) / active_mbs); int q; +// TODO(jimbankoski): remove #if here or above when this has been +// well tested. +#if !CONFIG_ALWAYS_ADJUST_BPM // based on recent history adjust expectations of bits per macroblock.
last_group_rate_err = (double)twopass->rolling_arf_group_actual_bits / @@ -1613,6 +1637,7 @@ static int get_twopass_worst_quality(VP9_COMP *cpi, const double section_err, last_group_rate_err = VPXMAX(0.25, VPXMIN(4.0, last_group_rate_err)); twopass->bpm_factor *= (3.0 + last_group_rate_err) / 4.0; twopass->bpm_factor = VPXMAX(0.25, VPXMIN(4.0, twopass->bpm_factor)); +#endif // Try and pick a max Q that will be high enough to encode the // content at the given rate. @@ -1666,7 +1691,7 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width, void vp9_init_second_pass(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; - const VP9EncoderConfig *const oxcf = &cpi->oxcf; + VP9EncoderConfig *const oxcf = &cpi->oxcf; const int is_two_pass_svc = (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); RATE_CONTROL *const rc = &cpi->rc; @@ -1686,6 +1711,63 @@ void vp9_init_second_pass(VP9_COMP *cpi) { *stats = *twopass->stats_in_end; twopass->total_left_stats = *stats; + // Scan the first pass file and calculate a modified score for each + // frame that is used to distribute bits. The modified score is assumed + // to provide a linear basis for bit allocation. I.e., a frame A with a score + // that is double that of frame B will be allocated 2x as many bits. + { + double modified_score_total = 0.0; + const FIRSTPASS_STATS *s = twopass->stats_in; + double av_err; + + if (oxcf->vbr_corpus_complexity) { + twopass->mean_mod_score = (double)oxcf->vbr_corpus_complexity / 10.0; + av_err = get_distribution_av_err(cpi, twopass); + } else { + av_err = get_distribution_av_err(cpi, twopass); + // The first scan is unclamped and gives a raw average. + while (s < twopass->stats_in_end) { + modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err); + ++s; + } + + // The average error from this first scan is used to define the midpoint + // error for the rate distribution function. + twopass->mean_mod_score = + modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); + } + + // Second scan using clamps based on the previous cycle average. + // This may modify the total and average somewhat but we don't bother with + // further iterations. + modified_score_total = 0.0; + s = twopass->stats_in; + while (s < twopass->stats_in_end) { + modified_score_total += + calculate_norm_frame_score(cpi, twopass, oxcf, s, av_err); + ++s; + } + twopass->normalized_score_left = modified_score_total; + + // If using Corpus wide VBR mode then update the clip target bandwidth to + // reflect how the clip compares to the rest of the corpus. + if (oxcf->vbr_corpus_complexity) { + oxcf->target_bandwidth = + (int64_t)((double)oxcf->target_bandwidth * + (twopass->normalized_score_left / stats->count)); + } + +#if COMPLEXITY_STATS_OUTPUT + { + FILE *compstats; + compstats = fopen("complexity_stats.stt", "a"); + fprintf(compstats, "%10.3lf\n", + twopass->normalized_score_left / stats->count); + fclose(compstats); + } +#endif + } + frame_rate = 10000000.0 * stats->count / stats->duration; // Each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame @@ -1708,37 +1790,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // This variable monitors how far behind the second ref update is lagging. twopass->sr_update_lag = 1; - // Scan the first pass file and calculate a modified score for each - // frame that is used to distribute bits. The modified score is assumed - // to provide a linear basis for bit allocation.
I.e a frame A with a score - that is double that of frame B will be allocated 2x as many bits. - { - const FIRSTPASS_STATS *s = twopass->stats_in; - double modified_score_total = 0.0; - - // The first scan is unclamped and gives a raw average. - while (s < twopass->stats_in_end) { - modified_score_total += calculate_mod_frame_score(cpi, twopass, oxcf, s); - ++s; - } - - // The average error from this first scan is used to define the midpoint - // error for the rate distribution function. - twopass->mean_mod_score = - modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count); - - // Second scan using clamps based on the previous cycle average. - // This may modify the total and average somewhat but we dont bother with - // further itterations. - s = twopass->stats_in; - modified_score_total = 0.0; - while (s < twopass->stats_in_end) { - modified_score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s); - ++s; - } - twopass->normalized_score_left = modified_score_total; - } - // Reset the vbr bits off target counters rc->vbr_bits_off_target = 0; rc->vbr_bits_off_target_fast = 0; @@ -1897,9 +1948,9 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, } #define BASELINE_ERR_PER_MB 12500.0 +#define GF_MAX_BOOST 96.0 static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, - double *sr_accumulator, - double this_frame_mv_in_out, double max_boost) { + double this_frame_mv_in_out) { double frame_boost; const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); @@ -1908,13 +1959,7 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, // Underlying boost factor is based on inter error ratio. frame_boost = (BASELINE_ERR_PER_MB * active_area) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); - - // Update the accumulator for second ref error difference. - // This is intended to give an indication of how much the coded error is - // increasing over time. - *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); - *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + DOUBLE_DIVIDE_CHECK(this_frame->coded_error); // Small adjustment for cases where there is a zoom out if (this_frame_mv_in_out > 0.0) @@ -1923,7 +1968,7 @@ static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, // Q correction and scaling frame_boost = frame_boost * boost_q_correction; - return VPXMIN(frame_boost, max_boost * boost_q_correction); + return VPXMIN(frame_boost, GF_MAX_BOOST * boost_q_correction); } #define KF_BASELINE_ERR_PER_MB 12500.0 @@ -1958,8 +2003,7 @@ static double calc_kf_frame_boost(VP9_COMP *cpi, return VPXMIN(frame_boost, max_boost * boost_q_correction); } -static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, - int *f_boost, int *b_boost) { +static int calc_arf_boost(VP9_COMP *cpi, int f_frames, int b_frames) { TWO_PASS *const twopass = &cpi->twopass; int i; double boost_score = 0.0; @@ -1968,13 +2012,12 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; - double sr_accumulator = 0.0; int arf_boost; int flash_detected = 0; // Search forward from the proposed arf/next gf position.
for (i = 0; i < f_frames; ++i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -1984,8 +2027,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1); // Accumulate the effect of prediction quality decay. if (!flash_detected) { @@ -1994,14 +2036,11 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, ? MIN_DECAY_FACTOR : decay_accumulator; } - - sr_accumulator = 0.0; boost_score += decay_accumulator * - calc_frame_boost(cpi, this_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out); } - *f_boost = (int)boost_score; + arf_boost = (int)boost_score; // Reset for backward looking loop. boost_score = 0.0; @@ -2010,11 +2049,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, this_frame_mv_in_out = 0.0; mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; - sr_accumulator = 0.0; // Search backward towards last gf position. for (i = -1; i >= -b_frames; --i) { - const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i + offset); + const FIRSTPASS_STATS *this_frame = read_frame_stats(twopass, i); if (this_frame == NULL) break; // Update the motion related elements to the boost calculation. @@ -2024,8 +2062,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. - flash_detected = detect_flash(twopass, i + offset) || - detect_flash(twopass, i + offset + 1); + flash_detected = detect_flash(twopass, i) || detect_flash(twopass, i + 1); // Cumulative effect of prediction quality decay. if (!flash_detected) { @@ -2034,17 +2071,13 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, ? MIN_DECAY_FACTOR : decay_accumulator; } - - sr_accumulator = 0.0; boost_score += decay_accumulator * - calc_frame_boost(cpi, this_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); + calc_frame_boost(cpi, this_frame, this_frame_mv_in_out); } - *b_boost = (int)boost_score; + arf_boost += (int)boost_score; - arf_boost = (*f_boost + *b_boost); - if (arf_boost < ((b_frames + f_frames) * 20)) - arf_boost = ((b_frames + f_frames) * 20); + if (arf_boost < ((b_frames + f_frames) * 40)) + arf_boost = ((b_frames + f_frames) * 40); arf_boost = VPXMAX(arf_boost, MIN_ARF_GF_BOOST); return arf_boost; @@ -2105,7 +2138,7 @@ static int calculate_boost_bits(int frame_count, int boost, int allocation_chunks; // return 0 for invalid inputs (could arise e.g.
through rounding errors) - if (!boost || (total_group_bits <= 0) || (frame_count <= 0)) return 0; + if (!boost || (total_group_bits <= 0) || (frame_count < 0)) return 0; allocation_chunks = (frame_count * 100) + boost; @@ -2133,8 +2166,33 @@ static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) { arf_buffer_indices[1] = ARF_SLOT2; } +// Used in corpus vbr: Calculates the total normalized group complexity score +// for a given number of frames starting at the current position in the stats +// file. +static double calculate_group_score(VP9_COMP *cpi, double av_score, + int frame_count) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + const FIRSTPASS_STATS *s = twopass->stats_in; + double score_total = 0.0; + int i = 0; + + // We don't ever want to return a 0 score here. + if (frame_count == 0) return 1.0; + + while ((i < frame_count) && (s < twopass->stats_in_end)) { + score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score); + ++s; + ++i; + } + assert(i == frame_count); + + return score_total; +} + static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int gf_arf_bits) { + VP9EncoderConfig *const oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; TWO_PASS *const twopass = &cpi->twopass; GF_GROUP *const gf_group = &twopass->gf_group; @@ -2143,7 +2201,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, int frame_index = 1; int target_frame_size; int key_frame; - const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf); + const int max_bits = frame_max_bits(&cpi->rc, oxcf); int64_t total_group_bits = gf_group_bits; int mid_boost_bits = 0; int mid_frame_idx; @@ -2153,8 +2211,10 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1; int normal_frames; int normal_frame_bits; - int last_frame_bits; - int last_frame_reduction; + int last_frame_reduction = 0; + double av_score = 1.0; + double tot_norm_frame_score = 1.0; + double this_frame_score = 1.0; // Only encode alt reference frame in temporal base layer. if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers; @@ -2226,17 +2286,14 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1; normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending); - - // The last frame in the group is used less as a predictor so reduce - // its allocation a little. - if (normal_frames > 1) { + if (normal_frames > 1) normal_frame_bits = (int)(total_group_bits / normal_frames); - last_frame_reduction = normal_frame_bits / 16; - last_frame_bits = normal_frame_bits - last_frame_reduction; - } else { + else normal_frame_bits = (int)total_group_bits; - last_frame_bits = normal_frame_bits; - last_frame_reduction = 0; + + if (oxcf->vbr_corpus_complexity) { + av_score = get_distribution_av_err(cpi, twopass); + tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames); } // Allocate bits to the other frames in the group. @@ -2248,11 +2305,18 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, ++frame_index; } - target_frame_size = (i == (normal_frames - 1)) - ? last_frame_bits - : (i == mid_frame_idx) - ?
normal_frame_bits + last_frame_reduction - : normal_frame_bits; + if (oxcf->vbr_corpus_complexity) { + this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf, + &frame_stats, av_score); + normal_frame_bits = (int)((double)total_group_bits * + (this_frame_score / tot_norm_frame_score)); + } + + target_frame_size = normal_frame_bits; + if ((i == (normal_frames - 1)) && (i >= 1)) { + last_frame_reduction = normal_frame_bits / 16; + target_frame_size -= last_frame_reduction; + } if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) { mid_boost_bits += (target_frame_size >> 4); @@ -2273,6 +2337,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits, ++frame_index; } + // Add in some extra bits for the middle frame in the group. + gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction; + // Note: // We need to configure the frame at the end of the sequence + 1 that will be // the start frame for the next group. Otherwise prior to the call to @@ -2316,6 +2383,8 @@ static void adjust_group_arnr_filter(VP9_COMP *cpi, double section_noise, // Analyse and define a gf/arf group. #define ARF_DECAY_BREAKOUT 0.10 +#define ARF_ABS_ZOOM_THRESH 4.0 + static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; @@ -2325,8 +2394,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { const FIRSTPASS_STATS *const start_pos = twopass->stats_in; int i; - double boost_score = 0.0; - double old_boost_score = 0.0; double gf_group_err = 0.0; double gf_group_raw_error = 0.0; double gf_group_noise = 0.0; @@ -2338,7 +2405,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mod_frame_err = 0.0; double mv_ratio_accumulator = 0.0; - double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; @@ -2347,13 +2413,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; double mv_ratio_accumulator_thresh; - double mv_in_out_thresh; double abs_mv_in_out_thresh; double sr_accumulator = 0.0; + const double av_err = get_distribution_av_err(cpi, twopass); unsigned int allow_alt_ref = is_altref_enabled(cpi); - int f_boost = 0; - int b_boost = 0; int flash_detected; int active_max_gf_interval; int active_min_gf_interval; @@ -2372,7 +2436,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); // Load stats for the current frame. - mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + mod_frame_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Note the error of the frame at the start of the group. This will be // the GF frame error if we code a normal gf. @@ -2393,8 +2458,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Motion breakout threshold for loop below depends on image size. mv_ratio_accumulator_thresh = (cpi->initial_height + cpi->initial_width) / 4.0; - mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 300.0; - abs_mv_in_out_thresh = (cpi->initial_height + cpi->initial_width) / 200.0; + abs_mv_in_out_thresh = ARF_ABS_ZOOM_THRESH; // Set a maximum and minimum interval for the GF group. // If the image appears almost completely static we can extend beyond this. 
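The corpus VBR path above splits the GF group budget in proportion to each frame's normalized complexity score. A minimal standalone sketch of that proportional split, assuming hypothetical inputs: scores[] stands in for per-frame calculate_norm_frame_score() output and total_group_bits for the group budget; this is an illustration of the idea, not the libvpx implementation.

  #include <assert.h>
  #include <stdint.h>

  static int corpus_vbr_frame_bits(const double *scores, int frame_count,
                                   int frame_idx, int64_t total_group_bits) {
    double score_total = 0.0;
    int i;
    assert(frame_idx >= 0 && frame_idx < frame_count);
    for (i = 0; i < frame_count; ++i) score_total += scores[i];
    // Guard against a degenerate zero total (cf. DOUBLE_DIVIDE_CHECK).
    if (score_total <= 0.0) return 0;
    // Each frame gets a share of the budget proportional to its score.
    return (int)((double)total_group_bits * (scores[frame_idx] / score_total));
  }

A frame whose score is twice the group average thus receives twice the average per-frame allocation, matching the linear-basis assumption stated in vp9_init_second_pass() above.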
@@ -2438,7 +2502,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ++i; // Accumulate error score of frames in this gf group. - mod_frame_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + mod_frame_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); gf_group_err += mod_frame_err; gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; @@ -2463,8 +2528,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { last_loop_decay_rate = loop_decay_rate; loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator = decay_accumulator * loop_decay_rate; - // Monitor for static sections. zero_motion_accumulator = VPXMIN( zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); @@ -2476,13 +2539,16 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { allow_alt_ref = 0; break; } - } - // Calculate a boost number for this frame. - sr_accumulator = 0.0; - boost_score += decay_accumulator * - calc_frame_boost(cpi, &next_frame, &sr_accumulator, - this_frame_mv_in_out, GF_MAX_BOOST); + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + if (i == 1) { + sr_accumulator += next_frame.coded_error; + } else { + sr_accumulator += (next_frame.sr_coded_error - next_frame.coded_error); + } + } // Break out conditions. if ( @@ -2496,14 +2562,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || (abs_mv_in_out_accumulator > abs_mv_in_out_thresh) || - (mv_in_out_accumulator < -mv_in_out_thresh) || - (decay_accumulator < ARF_DECAY_BREAKOUT)))) { - boost_score = old_boost_score; + (sr_accumulator > next_frame.intra_error)))) { break; } *this_frame = next_frame; - old_boost_score = boost_score; } // Was the group length constrained by the requirement for a new KF? @@ -2512,9 +2575,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Should we use the alternate reference frame. if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { + const int forward_frames = (rc->frames_to_key - i >= i - 1) + ? i - 1 + : VPXMAX(0, rc->frames_to_key - i); + // Calculate the boost for alt ref. - rc->gfu_boost = - calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); + rc->gfu_boost = calc_arf_boost(cpi, forward_frames, (i - 1)); rc->source_alt_ref_pending = 1; // Test to see if multi arf is appropriate. @@ -2524,7 +2590,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { ? 
1 : 0; } else { - rc->gfu_boost = VPXMAX((int)boost_score, MIN_ARF_GF_BOOST); + rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1)); rc->source_alt_ref_pending = 0; } @@ -2548,7 +2614,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) { if (EOF == input_stats(twopass, this_frame)) break; gf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); gf_group_raw_error += this_frame->coded_error; gf_group_noise += this_frame->frame_noise_energy; gf_group_skip_pct += this_frame->intra_skip_pct; @@ -2587,6 +2653,12 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { group_av_noise, vbr_group_bits_per_frame); twopass->active_worst_quality = (tmp_q + (twopass->active_worst_quality * 3)) >> 2; + +#if CONFIG_ALWAYS_ADJUST_BPM + // Reset rolling actual and target bits counters for ARF groups. + twopass->rolling_arf_group_target_bits = 0; + twopass->rolling_arf_group_actual_bits = 0; +#endif } // Context Adjustment of ARNR filter strength @@ -2621,10 +2693,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default to starting GF groups at normal frame size. cpi->rc.next_frame_size_selector = UNSCALED; } - +#if !CONFIG_ALWAYS_ADJUST_BPM // Reset rolling actual and target bits counters for ARF groups. twopass->rolling_arf_group_target_bits = 0; twopass->rolling_arf_group_actual_bits = 0; +#endif } // Threshold for use of the lagging second reference frame. High second ref @@ -2769,7 +2842,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; double sr_accumulator = 0.0; - + const double av_err = get_distribution_av_err(cpi, twopass); vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; @@ -2793,7 +2866,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { twopass->kf_group_bits = 0; // Total bits available to kf group twopass->kf_group_error_left = 0.0; // Group modified error score. - kf_mod_err = calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + kf_mod_err = + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Initialize the decay rates for the recent frames to check for (j = 0; j < FRAMES_TO_CHECK_DECAY; ++j) recent_loop_decay[j] = 1.0; @@ -2803,7 +2877,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { while (twopass->stats_in < twopass->stats_in_end && rc->frames_to_key < cpi->oxcf.key_freq) { // Accumulate kf group error. - kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); // Load the next frame's stats. last_frame = *this_frame; @@ -2864,7 +2939,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Rescan to get the correct error data for the forced kf group. 
for (i = 0; i < rc->frames_to_key; ++i) { kf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame); + calculate_norm_frame_score(cpi, twopass, oxcf, &tmp_frame, av_err); input_stats(twopass, &tmp_frame); } rc->next_key_frame_forced = 1; @@ -2882,7 +2957,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) { if (EOF == input_stats(twopass, this_frame)) break; kf_group_err += - calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); } rc->frames_to_key = new_frame_to_key; } @@ -2890,7 +2965,8 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Special case for the last key frame of the file. if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error. - kf_group_err += calculate_norm_frame_score(cpi, twopass, oxcf, this_frame); + kf_group_err += + calculate_norm_frame_score(cpi, twopass, oxcf, this_frame, av_err); } // Calculate the number of bits that should be assigned to the kf group. diff --git a/libvpx/vp9/encoder/vp9_frame_scale.c b/libvpx/vp9/encoder/vp9_frame_scale.c index e58628388..a410d0407 100644 --- a/libvpx/vp9/encoder/vp9_frame_scale.c +++ b/libvpx/vp9/encoder/vp9_frame_scale.c @@ -20,8 +20,6 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, INTERP_FILTER filter_type, int phase_scaler) { const int src_w = src->y_crop_width; const int src_h = src->y_crop_height; - const int dst_w = dst->y_crop_width; - const int dst_h = dst->y_crop_height; const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; @@ -30,23 +28,86 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, const InterpKernel *const kernel = vp9_filter_kernels[filter_type]; int x, y, i; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int factor = (i == 0 || i == 3 ? 1 : 2); - const int src_stride = src_strides[i]; - const int dst_stride = dst_strides[i]; - for (y = 0; y < dst_h; y += 16) { - const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; - for (x = 0; x < dst_w; x += 16) { - const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; - const uint8_t *src_ptr = srcs[i] + - (y / factor) * src_h / dst_h * src_stride + - (x / factor) * src_w / dst_w; - uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); +#if HAVE_SSSE3 || HAVE_NEON + // TODO(linfengz): The 4:3 specialized C code is disabled by default since + // it's much slower than the general version which calls vpx_scaled_2d() even + // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference + // for the platforms which have faster optimization. + if (4 * dst->y_crop_width == 3 * src_w && + 4 * dst->y_crop_height == 3 * src_h) { + // Specialize 4 to 3 scaling. + // Example pixel locations. + // (O: Original pixel. S: Scaled pixel. X: Overlapped pixel.) 
+ // phase_scaler = 0 | phase_scaler = 8 + // | + // X O S O S O X | O O O O O + // | + // | + // | S S S + // | + // | + // O O O O O | O O O O O + // | + // S S S S | + // | + // | + // | S S S + // O O O O O | O O O O O + // | + // | + // | + // S S S S | + // | + // O O O O O | O O O O O + // | S S S + // | + // | + // | + // | + // X O S O S O X | O O O O O - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + const int dst_ws[3] = { dst->y_crop_width, dst->uv_crop_width, + dst->uv_crop_width }; + const int dst_hs[3] = { dst->y_crop_height, dst->uv_crop_height, + dst->uv_crop_height }; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int dst_w = dst_ws[i]; + const int dst_h = dst_hs[i]; + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 3) { + for (x = 0; x < dst_w; x += 3) { + const uint8_t *src_ptr = srcs[i] + 4 * y / 3 * src_stride + 4 * x / 3; + uint8_t *dst_ptr = dsts[i] + y * dst_stride + x; + + // Must call c function because its optimization doesn't support 3x3. + vpx_scaled_2d_c(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + phase_scaler, 64 / 3, phase_scaler, 64 / 3, 3, 3); + } + } + } + } else +#endif + { + const int dst_w = dst->y_crop_width; + const int dst_h = dst->y_crop_height; + for (i = 0; i < MAX_MB_PLANE; ++i) { + const int factor = (i == 0 || i == 3 ? 1 : 2); + const int src_stride = src_strides[i]; + const int dst_stride = dst_strides[i]; + for (y = 0; y < dst_h; y += 16) { + const int y_q4 = y * (16 / factor) * src_h / dst_h + phase_scaler; + for (x = 0; x < dst_w; x += 16) { + const int x_q4 = x * (16 / factor) * src_w / dst_w + phase_scaler; + const uint8_t *src_ptr = srcs[i] + + (y / factor) * src_h / dst_h * src_stride + + (x / factor) * src_w / dst_w; + uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); + + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); + } } } } diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c index 24e23af3b..44f01be25 100644 --- a/libvpx/vp9/encoder/vp9_mcomp.c +++ b/libvpx/vp9/encoder/vp9_mcomp.c @@ -361,7 +361,7 @@ static unsigned int setup_center_error( #endif // CONFIG_VP9_HIGHBITDEPTH } -static INLINE int divide_and_round(const int n, const int d) { +static INLINE int64_t divide_and_round(const int64_t n, const int64_t d) { return ((n < 0) ^ (d < 0)) ? ((n - d / 2) / d) : ((n + d / 2) / d); } @@ -379,10 +379,13 @@ static INLINE int is_cost_list_wellbehaved(int *cost_list) { // y0 = 1/2 (S4 - S2)/(S4 + S2 - 2*S0). // The code below is an integerized version of that. 
static void get_cost_surf_min(int *cost_list, int *ir, int *ic, int bits) { - *ic = divide_and_round((cost_list[1] - cost_list[3]) * (1 << (bits - 1)), - (cost_list[1] - 2 * cost_list[0] + cost_list[3])); - *ir = divide_and_round((cost_list[4] - cost_list[2]) * (1 << (bits - 1)), - (cost_list[4] - 2 * cost_list[0] + cost_list[2])); + const int64_t x0 = (int64_t)cost_list[1] - cost_list[3]; + const int64_t y0 = cost_list[1] - 2 * (int64_t)cost_list[0] + cost_list[3]; + const int64_t x1 = (int64_t)cost_list[4] - cost_list[2]; + const int64_t y1 = cost_list[4] - 2 * (int64_t)cost_list[0] + cost_list[2]; + const int b = 1 << (bits - 1); + *ic = (int)divide_and_round(x0 * b, y0); + *ir = (int)divide_and_round(x1 * b, y1); } uint32_t vp9_skip_sub_pixel_tree(const MACROBLOCK *x, MV *bestmv, @@ -441,7 +444,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( cost_list[2] != INT_MAX && cost_list[3] != INT_MAX && cost_list[4] != INT_MAX && is_cost_list_wellbehaved(cost_list)) { int ir, ic; - unsigned int minpt; + unsigned int minpt = INT_MAX; get_cost_surf_min(cost_list, &ir, &ic, 2); if (ir != 0 || ic != 0) { CHECK_BETTER(minpt, tr + 2 * ir, tc + 2 * ic); @@ -2039,197 +2042,6 @@ static int full_pixel_exhaustive(VP9_COMP *cpi, MACROBLOCK *x, return bestsme; } -int vp9_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r, c; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - for (c = col_min; c < col_max; ++c) { - const MV mv = { r, c }; - const int sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), - in_what->stride) + - mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - } - return best_sad; -} - -int vp9_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = 
&in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - -int vp9_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv, - int sad_per_bit, int distance, - const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv, MV *best_mv) { - int r; - const MACROBLOCKD *const xd = &x->e_mbd; - const struct buf_2d *const what = &x->plane[0].src; - const struct buf_2d *const in_what = &xd->plane[0].pre[0]; - const int row_min = VPXMAX(ref_mv->row - distance, x->mv_limits.row_min); - const int row_max = VPXMIN(ref_mv->row + distance, x->mv_limits.row_max); - const int col_min = VPXMAX(ref_mv->col - distance, x->mv_limits.col_min); - const int col_max = VPXMIN(ref_mv->col + distance, x->mv_limits.col_max); - const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), - in_what->stride) + - mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - *best_mv = *ref_mv; - - for (r = row_min; r < row_max; ++r) { - int c = col_min; - const uint8_t *check_here = &in_what->buf[r * in_what->stride + c]; - - if (fn_ptr->sdx8f != NULL) { - while ((c + 7) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[8]); - - fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 8; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - if (fn_ptr->sdx3f != NULL) { - while ((c + 2) < col_max) { - int i; - DECLARE_ALIGNED(16, uint32_t, sads[3]); - - fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride, - sads); - - for (i = 0; i < 3; ++i) { - unsigned int sad = sads[i]; - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - } - - while (c < col_max) { - unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, check_here, in_what->stride); - if (sad < best_sad) { - const MV mv = { r, c }; - sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit); - if (sad < best_sad) { - best_sad = sad; - *best_mv = mv; - } - } - ++check_here; - ++c; - } - } - - return best_sad; -} - int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, const vp9_variance_fn_ptr_t *fn_ptr, diff --git a/libvpx/vp9/encoder/vp9_noise_estimate.c b/libvpx/vp9/encoder/vp9_noise_estimate.c index e2239b44b..276a0c785 100644 --- a/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -21,6 +21,15 
@@ #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_encoder.h" +#if CONFIG_VP9_TEMPORAL_DENOISING +// For SVC: only do noise estimation on top spatial layer. +static INLINE int noise_est_svc(const struct VP9_COMP *const cpi) { + return (!cpi->use_svc || + (cpi->use_svc && + cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)); +} +#endif + void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->enabled = 0; ne->level = kLowLow; @@ -34,7 +43,7 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { } else if (width * height >= 1280 * 720) { ne->thresh = 140; } else if (width * height >= 640 * 360) { - ne->thresh = 100; + ne->thresh = 115; } ne->num_frames_estimate = 15; } @@ -45,7 +54,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { #endif // Enable noise estimation if denoising is on. #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->common.width >= 320 && cpi->common.height >= 180) return 1; #endif @@ -56,8 +65,8 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_CBR && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc && - cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 && - cpi->common.height >= 360) + cpi->oxcf.content != VP9E_CONTENT_SCREEN && + cpi->common.width * cpi->common.height >= 640 * 360) return 1; else return 0; @@ -111,7 +120,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { // Estimate is between current source and last source. YV12_BUFFER_CONFIG *last_source = cpi->Last_Source; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) { + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) { last_source = &cpi->denoiser.last_source; // Tune these thresholds for different resolutions when denoising is // enabled. @@ -131,7 +140,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { (cpi->svc.number_spatial_layers == 1 && (ne->last_w != cm->width || ne->last_h != cm->height))) { #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif if (last_source != NULL) { @@ -146,7 +155,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->count = 0; ne->num_frames_estimate = 10; #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) && + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi) && cpi->svc.current_superframe > 1) { vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); copy_frame(&cpi->denoiser.last_source, cpi->Source); @@ -190,44 +199,42 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { int bl_index1 = bl_index + 1; int bl_index2 = bl_index + cm->mi_cols; int bl_index3 = bl_index2 + 1; - // Only consider blocks that are likely steady background. i.e, have - // been encoded as zero/low motion x (= thresh_consec_zeromv) frames - // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all - // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. 
int consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - int is_skin = 0; - if (cpi->use_skin_detection) { - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, bsize, consec_zeromv, 0); - } - if (frame_low_motion && - cpi->consec_zero_mv[bl_index] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index1] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index2] > thresh_consec_zeromv && - cpi->consec_zero_mv[bl_index3] > thresh_consec_zeromv && - !is_skin) { - // Compute variance. - unsigned int sse; - unsigned int variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, last_src_y, last_src_ystride, &sse); - // Only consider this block as valid for noise measurement if the - // average term (sse - variance = N * avg^{2}, N = 16X16) of the - // temporal residual is small (avoid effects from lighting change). - if ((sse - variance) < thresh_sum_diff) { - unsigned int sse2; - const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( - src_y, src_ystride, const_source, 0, &sse2); - // Avoid blocks with high brightness and high spatial variance. - if ((sse2 - spatial_variance) < thresh_sum_spatial && - spatial_variance < thresh_spatial_var) { - avg_est += low_res ? variance >> 4 - : variance / ((spatial_variance >> 9) + 1); - num_samples++; + // Only consider blocks that are likely steady background. i.e., have + // been encoded as zero/low motion x (= thresh_consec_zeromv) frames + // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all + // 4 sub-blocks for 16x16 block. Also, avoid skin blocks. + if (frame_low_motion && consec_zeromv > thresh_consec_zeromv) { + int is_skin = 0; + if (cpi->use_skin_detection) { + is_skin = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, + src_uvstride, bsize, consec_zeromv, 0); + } + if (!is_skin) { + unsigned int sse; + // Compute variance. + unsigned int variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, last_src_y, last_src_ystride, &sse); + // Only consider this block as valid for noise measurement if the + // average term (sse - variance = N * avg^{2}, N = 16X16) of the + // temporal residual is small (avoid effects from lighting + // change). + if ((sse - variance) < thresh_sum_diff) { + unsigned int sse2; + const unsigned int spatial_variance = cpi->fn_ptr[bsize].vf( + src_y, src_ystride, const_source, 0, &sse2); + // Avoid blocks with high brightness and high spatial variance. + if ((sse2 - spatial_variance) < thresh_sum_spatial && + spatial_variance < thresh_spatial_var) { + avg_est += low_res ?
variance >> 4 + : variance / ((spatial_variance >> 9) + 1); + num_samples++; + } } } } @@ -259,14 +266,14 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { ne->count = 0; ne->level = vp9_noise_estimate_extract_level(ne); #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) vp9_denoiser_set_noise_level(&cpi->denoiser, ne->level); #endif } } } #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi)) + if (cpi->oxcf.noise_sensitivity > 0 && noise_est_svc(cpi)) copy_frame(&cpi->denoiser.last_source, cpi->Source); #endif } diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c index b05f4184b..f2f323a28 100644 --- a/libvpx/vp9/encoder/vp9_pickmode.c +++ b/libvpx/vp9/encoder/vp9_pickmode.c @@ -158,6 +158,7 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const MvLimits tmp_mv_limits = x->mv_limits; int rv = 0; int cost_list[5]; + int search_subpel = 1; const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref); if (scaled_ref_frame) { @@ -192,9 +193,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, else center_mv = tmp_mv->as_mv; - vp9_full_pixel_search( - cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, - cond_cost_list(cpi, cost_list), ¢er_mv, &tmp_mv->as_mv, INT_MAX, 0); + if (x->sb_use_mv_part) { + tmp_mv->as_mv.row = x->sb_mvrow_part >> 3; + tmp_mv->as_mv.col = x->sb_mvcol_part >> 3; + } else { + vp9_full_pixel_search( + cpi, x, bsize, &mvp_full, step_param, cpi->sf.mv.search_method, sadpb, + cond_cost_list(cpi, cost_list), ¢er_mv, &tmp_mv->as_mv, INT_MAX, 0); + } x->mv_limits = tmp_mv_limits; @@ -210,8 +216,14 @@ static int combined_motion_search(VP9_COMP *cpi, MACROBLOCK *x, rv = !(RDCOST(x->rdmult, x->rddiv, (*rate_mv + rate_mode), 0) > best_rd_sofar); - if (rv) { - const int subpel_force_stop = cpi->sf.mv.subpel_force_stop; + // For SVC on non-reference frame, avoid subpel for (0, 0) motion. + if (cpi->use_svc && cpi->svc.non_reference_frame) { + if (mvp_full.row == 0 && mvp_full.col == 0) search_subpel = 0; + } + + if (rv && search_subpel) { + int subpel_force_stop = cpi->sf.mv.subpel_force_stop; + if (use_base_mv && cpi->sf.base_mv_aggressive) subpel_force_stop = 2; cpi->find_fractional_mv_step( x, &tmp_mv->as_mv, &ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit, &cpi->fn_ptr[bsize], subpel_force_stop, @@ -318,7 +330,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, unsigned int *var_y, unsigned int *sse_y, - int mi_row, int mi_col, int *early_term) { + int mi_row, int mi_col, int *early_term, + int *flag_preduv_computed) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. 
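The comment closing the hunk above is worth unpacking: the transform scales coefficients by a factor of 8, so a quantizer step taken from the dequant tables is 8x the effective step and must be divided by 8 before it is handed to the rate/distortion model. A minimal sketch of that conversion, where dc_dequant is a hypothetical stand-in for a table value:

  static int effective_quant_step(int dc_dequant) {
    // Undo the 8x transform scaling before rate/distortion modeling.
    return dc_dequant >> 3;
  }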
@@ -475,6 +488,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, int j = i - 1; vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + flag_preduv_computed[i - 1] = 1; var_uv[j] = cpi->fn_ptr[uv_bsize].vf( p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); @@ -664,7 +678,9 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, #endif if (cpi->sf.use_simple_block_yrd && cpi->common.frame_type != KEY_FRAME && - bsize < BLOCK_32X32) { + (bsize < BLOCK_32X32 || + (cpi->use_svc && + (bsize < BLOCK_32X32 || cpi->svc.temporal_layer_id > 0)))) { unsigned int var_y, sse_y; (void)tx_size; if (!rd_computed) @@ -711,7 +727,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, scan_order->iscan); break; case TX_4X4: - x->fwd_txm4x4(src_diff, coeff, diff_stride); + x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); @@ -846,13 +862,11 @@ static void free_pred_buffer(PRED_BUFFER *p) { if (p != NULL) p->in_use = 0; } -static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col, - MV_REFERENCE_FRAME ref_frame, - PREDICTION_MODE this_mode, unsigned int var_y, - unsigned int sse_y, - struct buf_2d yv12_mb[][MAX_MB_PLANE], - int *rate, int64_t *dist) { +static void encode_breakout_test( + VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, + MV_REFERENCE_FRAME ref_frame, PREDICTION_MODE this_mode, unsigned int var_y, + unsigned int sse_y, struct buf_2d yv12_mb[][MAX_MB_PLANE], int *rate, + int64_t *dist, int *flag_preduv_computed) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *const mi = xd->mi[0]; const BLOCK_SIZE uv_size = get_plane_block_size(bsize, &xd->plane[1]); @@ -862,6 +876,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Skipping threshold for dc. unsigned int thresh_dc; int motion_low = 1; + if (cpi->use_svc && ref_frame == GOLDEN_FRAME) return; if (mi->mv[0].as_mv.row > 64 || mi->mv[0].as_mv.row < -64 || mi->mv[0].as_mv.col > 64 || mi->mv[0].as_mv.col < -64) motion_low = 0; @@ -912,9 +927,7 @@ static void encode_breakout_test(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, thresh_dc_uv = 0; } - // Skip UV prediction unless breakout is zero (lossless) to save - // computation with low impact on the result - if (x->encode_breakout == 0) { + if (!flag_preduv_computed[0] || !flag_preduv_computed[1]) { xd->plane[1].pre[0] = yv12_mb[ref_frame][1]; xd->plane[2].pre[0] = yv12_mb[ref_frame][2]; vp9_build_inter_predictors_sbuv(xd, mi_row, mi_col, bsize); @@ -1163,33 +1176,22 @@ static const REF_MODE ref_mode_set[RT_INTER_MODES] = { { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV }, { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV } }; -static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = { + +#define RT_INTER_MODES_SVC 8 +static const REF_MODE ref_mode_set_svc[RT_INTER_MODES_SVC] = { { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, { LAST_FRAME, NEARMV }, { GOLDEN_FRAME, ZEROMV }, { GOLDEN_FRAME, NEARESTMV }, { GOLDEN_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEWMV } }; -static int set_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize) { - const VP9_COMMON *const cm = &cpi->common; - // Reduce the intra cost penalty for small blocks (<=16x16). - int reduction_fac = - (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 
4 : 2) : 0; - if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) - // Don't reduce intra cost penalty if estimated noise level is high. - reduction_fac = 0; - return vp9_get_intra_cost_penalty(cm->base_qindex, cm->y_dc_delta_q, - cm->bit_depth) >> - reduction_fac; -} - static INLINE void find_predictors( VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame, int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask, const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col, struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize, - int force_skip_low_temp_var) { + int force_skip_low_temp_var, int comp_pred_allowed) { VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); @@ -1203,7 +1205,7 @@ static INLINE void find_predictors( int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame]; const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); - if (cm->use_prev_frame_mvs) { + if (cm->use_prev_frame_mvs || comp_pred_allowed) { vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, x->mbmi_ext->mode_context); } else { @@ -1425,10 +1427,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, struct macroblockd_plane *const pd = &xd->plane[0]; PREDICTION_MODE best_mode = ZEROMV; MV_REFERENCE_FRAME ref_frame, best_ref_frame = LAST_FRAME; - MV_REFERENCE_FRAME usable_ref_frame; + MV_REFERENCE_FRAME usable_ref_frame, second_ref_frame; TX_SIZE best_tx_size = TX_SIZES; INTERP_FILTER best_pred_filter = EIGHTTAP; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; + uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; @@ -1437,7 +1440,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // var_y and sse_y are saved to be used in skipping checking unsigned int var_y = UINT_MAX; unsigned int sse_y = UINT_MAX; - const int intra_cost_penalty = set_intra_cost_penalty(cpi, bsize); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int *const rd_threshes = cpi->rd.threshes[mi->segment_id][bsize]; @@ -1483,6 +1487,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int force_skip_low_temp_var = 0; int skip_ref_find_pred[4] = { 0 }; unsigned int sse_zeromv_normalized = UINT_MAX; + unsigned int best_sse_sofar = UINT_MAX; unsigned int thresh_svc_skip_golden = 500; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; @@ -1490,9 +1495,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int denoise_svc_pickmode = 1; #endif INTERP_FILTER filter_gf_svc = EIGHTTAP; + MV_REFERENCE_FRAME best_second_ref_frame = NONE; + int comp_modes = 0; + int num_inter_modes = (cpi->use_svc) ? 
RT_INTER_MODES_SVC : RT_INTER_MODES; + int flag_svc_subpel = 0; + int svc_mv_col = 0; + int svc_mv_row = 0; init_ref_frame_cost(cm, xd, ref_frame_cost); + memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); + if (reuse_inter_pred) { int i; for (i = 0; i < 3; i++) { @@ -1561,7 +1574,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #endif - if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc) { + if (cpi->rc.frames_since_golden == 0 && !cpi->use_svc && + !cpi->rc.alt_ref_gf_group && !cpi->rc.last_frame_is_src_altref) { usable_ref_frame = LAST_FRAME; } else { usable_ref_frame = GOLDEN_FRAME; @@ -1575,6 +1589,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, skip_ref_find_pred[LAST_FRAME] = 1; skip_ref_find_pred[GOLDEN_FRAME] = 1; } + if (!cm->show_frame) { + if (cpi->rc.frames_since_key == 1) { + usable_ref_frame = LAST_FRAME; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + skip_ref_find_pred[ALTREF_FRAME] = 1; + } + } } // For svc mode, on spatial_layer_id > 0: if the reference has different scale @@ -1609,18 +1630,39 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->oxcf.speed >= 8 && !cpi->use_svc && ((cpi->rc.frames_since_golden + 1) < x->last_sb_high_content || - x->last_sb_high_content > 40)) + x->last_sb_high_content > 40 || cpi->rc.frames_since_golden > 120)) usable_ref_frame = LAST_FRAME; + // Compound prediction modes: (0,0) on LAST/GOLDEN and ARF. + if (cm->reference_mode == REFERENCE_MODE_SELECT && + cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME) + comp_modes = 2; + for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { if (!skip_ref_find_pred[ref_frame]) { find_predictors(cpi, x, ref_frame, frame_mv, const_motion, &ref_frame_skip_mask, flag_list, tile_data, mi_row, - mi_col, yv12_mb, bsize, force_skip_low_temp_var); + mi_col, yv12_mb, bsize, force_skip_low_temp_var, + comp_modes > 0); } } - for (idx = 0; idx < RT_INTER_MODES; ++idx) { + if (cpi->use_svc || cpi->oxcf.speed <= 7 || bsize < BLOCK_32X32) + x->sb_use_mv_part = 0; + + // Set the flag_svc_subpel to 1 for SVC if the lower spatial layer used + // an averaging filter for downsampling (phase = 8). If so, we will test + // a nonzero motion mode on the spatial (golden) reference. + // The nonzero motion is half pixel shifted to left and top (-4, -4). + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + svc_force_zero_mode[GOLDEN_FRAME - 1] && + cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id - 1] == 8) { + svc_mv_col = -4; + svc_mv_row = -4; + flag_svc_subpel = 1; + } + + for (idx = 0; idx < num_inter_modes + comp_modes; ++idx) { int rate_mv = 0; int mode_rd_thresh; int mode_index; @@ -1629,17 +1671,56 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int is_skippable; int this_early_term = 0; int rd_computed = 0; + int flag_preduv_computed[2] = { 0 }; + int inter_mv_mode = 0; + int skip_this_mv = 0; + int comp_pred = 0; + int force_gf_mv = 0; + PREDICTION_MODE this_mode; + second_ref_frame = NONE; + + if (idx < num_inter_modes) { + this_mode = ref_mode_set[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; + + if (cpi->use_svc) { + this_mode = ref_mode_set_svc[idx].pred_mode; + ref_frame = ref_mode_set_svc[idx].ref_frame; + } + } else { + // Add (0,0) compound modes.
+ this_mode = ZEROMV; + ref_frame = LAST_FRAME; + if (idx == num_inter_modes + comp_modes - 1) ref_frame = GOLDEN_FRAME; + second_ref_frame = ALTREF_FRAME; + comp_pred = 1; + } - PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; + if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; - ref_frame = ref_mode_set[idx].ref_frame; + if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) { + force_gf_mv = 1; + // Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row), + // otherwise set NEWMV to (svc_mv_col, svc_mv_row). + if (this_mode == NEWMV) { + frame_mv[this_mode][ref_frame].as_mv.col = svc_mv_col; + frame_mv[this_mode][ref_frame].as_mv.row = svc_mv_row; + } else if (frame_mv[this_mode][ref_frame].as_mv.col != svc_mv_col || + frame_mv[this_mode][ref_frame].as_mv.row != svc_mv_row) { + continue; + } + } - if (cpi->use_svc) { - this_mode = ref_mode_set_svc[idx].pred_mode; - ref_frame = ref_mode_set_svc[idx].ref_frame; + if (comp_pred) { + const struct segmentation *const seg = &cm->seg; + if (!cpi->allow_comp_inter_inter) continue; + // Skip compound inter modes if ARF is not available. + if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue; + // Do not allow compound prediction if the segment level reference frame + // feature is in use as in this case there can only be one reference. + if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue; } - if (ref_frame > usable_ref_frame) continue; - if (skip_ref_find_pred[ref_frame]) continue; // For SVC, skip the golden (spatial) reference search if sse of zeromv_last // is below threshold. @@ -1660,13 +1741,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, frame_mv[this_mode][ref_frame].as_int != 0)) continue; - if (cpi->rc.alt_ref_gf_group && + if (!cm->show_frame && ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && cm->show_frame && cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) && ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) continue; - if (cpi->rc.alt_ref_gf_group && + if (cpi->rc.alt_ref_gf_group && cm->show_frame && + cpi->rc.frames_since_golden > 0 && cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) && ref_frame == ALTREF_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) @@ -1680,12 +1766,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Skip non-zeromv mode search for golden frame if force_skip_low_temp_var // is set. If nearestmv for golden frame is 0, zeromv mode will be skipped // later. 
- if (force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && + if (!force_gf_mv && force_skip_low_temp_var && ref_frame == GOLDEN_FRAME && frame_mv[this_mode][ref_frame].as_int != 0) { continue; } - if ((cpi->sf.short_circuit_low_temp_var >= 2 || + if (x->content_state_sb != kVeryHighSad && + (cpi->sf.short_circuit_low_temp_var >= 2 || (cpi->sf.short_circuit_low_temp_var == 1 && bsize == BLOCK_64X64)) && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode == NEWMV) { @@ -1693,7 +1780,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } if (cpi->use_svc) { - if (svc_force_zero_mode[ref_frame - 1] && + if (!force_gf_mv && svc_force_zero_mode[ref_frame - 1] && frame_mv[this_mode][ref_frame].as_int != 0) continue; } @@ -1723,11 +1810,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (ref_frame_skip_mask & (1 << ref_frame)) continue; // Select prediction reference frames. - for (i = 0; i < MAX_MB_PLANE; i++) + for (i = 0; i < MAX_MB_PLANE; i++) { xd->plane[i].pre[0] = yv12_mb[ref_frame][i]; + if (comp_pred) xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i]; + } mi->ref_frame[0] = ref_frame; - set_ref_ptrs(cm, xd, ref_frame, NONE); + mi->ref_frame[1] = second_ref_frame; + set_ref_ptrs(cm, xd, ref_frame, second_ref_frame); mode_index = mode_idx[ref_frame][INTER_OFFSET(this_mode)]; mode_rd_thresh = best_mode_skip_txfm ? rd_threshes[mode_index] << 1 @@ -1747,12 +1837,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, &rd_thresh_freq_fact[mode_index]))) continue; - if (this_mode == NEWMV) { + if (this_mode == NEWMV && !force_gf_mv) { if (ref_frame > LAST_FRAME && !cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR) { int tmp_sad; uint32_t dis; - int cost_list[5]; + int cost_list[5] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX, INT_MAX }; if (bsize < BLOCK_16X16) continue; @@ -1780,17 +1870,37 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } else if (svc->use_base_mv && svc->spatial_layer_id) { if (frame_mv[NEWMV][ref_frame].as_int != INVALID_MV) { const int pre_stride = xd->plane[0].pre[0].stride; - int base_mv_sad = INT_MAX; - const float base_mv_bias = sf->base_mv_aggressive ? 1.5f : 1.0f; + unsigned int base_mv_sse = UINT_MAX; + int scale = (cpi->rc.avg_frame_low_motion > 60) ? 2 : 4; const uint8_t *const pre_buf = xd->plane[0].pre[0].buf + (frame_mv[NEWMV][ref_frame].as_mv.row >> 3) * pre_stride + (frame_mv[NEWMV][ref_frame].as_mv.col >> 3); - base_mv_sad = cpi->fn_ptr[bsize].sdf( - x->plane[0].src.buf, x->plane[0].src.stride, pre_buf, pre_stride); + cpi->fn_ptr[bsize].vf(x->plane[0].src.buf, x->plane[0].src.stride, + pre_buf, pre_stride, &base_mv_sse); + + // Exit NEWMV search if base_mv is (0,0) && bsize < BLOCK_16x16, + // for SVC encoding. + if (cpi->use_svc && cpi->svc.use_base_mv && bsize < BLOCK_16X16 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; - if (base_mv_sad < (int)(base_mv_bias * x->pred_mv_sad[ref_frame])) { + // Exit NEWMV search if base_mv_sse is large. + if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale)) + continue; + if (base_mv_sse < (best_sse_sofar << 1)) { // Base layer mv is good. + // Exit NEWMV search if the base_mv is (0, 0) and sse is low, since + // (0, 0) mode is already tested. 
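A note on the NEWMV gating in this hunk: the SSE of the base-layer motion vector is measured with the variance function, compared against the best SSE seen so far, and reduced to a roughly per-pixel figure by shifting with b_width_log2_lookup/b_height_log2_lookup instead of dividing. A minimal standalone sketch of that gating, with a hypothetical three-entry lookup and the thresholds from the hunk (the table and function names here are invented):

    /* Hypothetical stand-ins for b_width_log2_lookup / b_height_log2_lookup:
     * log2 of the block edge in 4-pixel units, so width == 4 << bw_log2.
     * Index: 0 = 8x8, 1 = 16x16, 2 = 64x64. */
    static const int bw_log2[3] = { 1, 2, 4 };
    static const int bh_log2[3] = { 1, 2, 4 };

    /* Returns 1 when the full NEWMV search can be skipped. */
    static int skip_newmv_for_base_mv(unsigned int base_mv_sse,
                                      unsigned int best_sse_sofar,
                                      int bsize_idx, int mv_row, int mv_col,
                                      int aggressive) {
      const int scale = 4; /* the patch uses 2 when avg_frame_low_motion > 60 */
      /* Base-layer MV predicts far worse than the best mode so far. */
      if (aggressive && base_mv_sse > (best_sse_sofar << scale)) return 1;
      if (base_mv_sse < (best_sse_sofar << 1)) {
        /* width * height == 16 << (bw_log2 + bh_log2), so this shift is a
         * division by the pixel count up to a factor of 16. */
        const unsigned int norm =
            base_mv_sse >> (bw_log2[bsize_idx] + bh_log2[bsize_idx]);
        if (aggressive && base_mv_sse <= best_sse_sofar && norm < 400 &&
            mv_row == 0 && mv_col == 0)
          return 1; /* (0,0) was already evaluated as ZEROMV */
      }
      return 0;
    }

The actual normalization and search continue below.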
+ unsigned int base_mv_sse_normalized = + base_mv_sse >> + (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sf->base_mv_aggressive && base_mv_sse <= best_sse_sofar && + base_mv_sse_normalized < 400 && + frame_mv[NEWMV][ref_frame].as_mv.row == 0 && + frame_mv[NEWMV][ref_frame].as_mv.col == 0) + continue; if (!combined_motion_search(cpi, x, bsize, mi_row, mi_col, &frame_mv[NEWMV][ref_frame], &rate_mv, best_rdc.rdcost, 1)) { @@ -1813,6 +1923,22 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } } + // TODO(jianj): Skipping the testing of (duplicate) non-zero motion vector + // causes some regression, leave it for duplicate zero-mv for now, until + // regression issue is resolved. + for (inter_mv_mode = NEARESTMV; inter_mv_mode <= NEWMV; inter_mv_mode++) { + if (inter_mv_mode == this_mode || comp_pred) continue; + if (mode_checked[inter_mv_mode][ref_frame] && + frame_mv[this_mode][ref_frame].as_int == + frame_mv[inter_mv_mode][ref_frame].as_int && + frame_mv[inter_mv_mode][ref_frame].as_int == 0) { + skip_this_mv = 1; + break; + } + } + + if (skip_this_mv) continue; + // If use_golden_nonzeromv is false, NEWMV mode is skipped for golden, no // need to compute best_pred_sad which is only used to skip golden NEWMV. if (use_golden_nonzeromv && this_mode == NEWMV && ref_frame == LAST_FRAME && @@ -1827,13 +1953,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, x->pred_mv_sad[LAST_FRAME] = best_pred_sad; } - if (this_mode != NEARESTMV && + if (this_mode != NEARESTMV && !comp_pred && frame_mv[this_mode][ref_frame].as_int == frame_mv[NEARESTMV][ref_frame].as_int) continue; mi->mode = this_mode; mi->mv[0].as_int = frame_mv[this_mode][ref_frame].as_int; + mi->mv[1].as_int = 0; // Search for the best prediction filter type, when the resulting // motion vector is at sub-pixel accuracy level for luma component, i.e., @@ -1851,7 +1978,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if ((this_mode == NEWMV || filter_ref == SWITCHABLE) && pred_filter_search && (ref_frame == LAST_FRAME || - (ref_frame == GOLDEN_FRAME && + (ref_frame == GOLDEN_FRAME && !force_gf_mv && (cpi->use_svc || cpi->oxcf.rc_mode == VPX_VBR))) && (((mi->mv[0].as_mv.row | mi->mv[0].as_mv.col) & 0x07) != 0)) { int pf_rate[3]; @@ -1907,9 +2034,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, pd->dst.stride = this_mode_pred->stride; } } else { - const int large_block = (x->sb_is_skin || cpi->oxcf.speed < 7) - ? bsize > BLOCK_32X32 - : bsize >= BLOCK_32X32; + // For low motion content use x->sb_is_skin in addition to VeryHighSad + // for setting large_block. + const int large_block = + (x->content_state_sb == kVeryHighSad || + (x->sb_is_skin && cpi->rc.avg_frame_low_motion > 70) || + cpi->oxcf.speed < 7) + ? bsize > BLOCK_32X32 + : bsize >= BLOCK_32X32; mi->interp_filter = (filter_ref == SWITCHABLE) ? 
EIGHTTAP : filter_ref; if (cpi->use_svc && ref_frame == GOLDEN_FRAME && @@ -1924,7 +2056,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, cm->base_qindex) { model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col, - &this_early_term); + &this_early_term, flag_preduv_computed); } else { rd_computed = 1; model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist, @@ -1936,6 +2068,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, sse_zeromv_normalized = sse_y >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); } + if (sse_y < best_sse_sofar) best_sse_sofar = sse_y; } if (!this_early_term) { @@ -1968,13 +2101,18 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1); } - if (x->color_sensitivity[0] || x->color_sensitivity[1]) { + if (!this_early_term && + (x->color_sensitivity[0] || x->color_sensitivity[1])) { RD_COST rdc_uv; const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, &xd->plane[1]); - if (x->color_sensitivity[0]) + if (x->color_sensitivity[0] && !flag_preduv_computed[0]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1); - if (x->color_sensitivity[1]) + flag_preduv_computed[0] = 1; + } + if (x->color_sensitivity[1] && !flag_preduv_computed[1]) { vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2); + flag_preduv_computed[1] = 1; + } model_rd_for_sb_uv(cpi, uv_bsize, x, xd, &rdc_uv, &var_y, &sse_y, 1, 2); this_rdc.rate += rdc_uv.rate; this_rdc.dist += rdc_uv.dist; @@ -1983,6 +2121,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, this_rdc.rate += rate_mv; this_rdc.rate += cpi->inter_mode_cost[x->mbmi_ext->mode_context[ref_frame]] [INTER_OFFSET(this_mode)]; + // TODO(marpan): Add costing for compound mode. this_rdc.rate += ref_frame_cost[ref_frame]; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist); @@ -2002,7 +2141,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->allow_encode_breakout) { encode_breakout_test(cpi, x, bsize, mi_row, mi_col, ref_frame, this_mode, var_y, sse_y, yv12_mb, &this_rdc.rate, - &this_rdc.dist); + &this_rdc.dist, flag_preduv_computed); if (x->skip) { this_rdc.rate += rate_mv; this_rdc.rdcost = @@ -2022,6 +2161,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (void)ctx; #endif + mode_checked[this_mode][ref_frame] = 1; + if (this_rdc.rdcost < best_rdc.rdcost || x->skip) { best_rdc = this_rdc; best_mode = this_mode; @@ -2030,6 +2171,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_ref_frame = ref_frame; best_mode_skip_txfm = x->skip_txfm[0]; best_early_term = this_early_term; + best_second_ref_frame = second_ref_frame; if (reuse_inter_pred) { free_pred_buffer(best_pred); @@ -2056,6 +2198,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, mi->mv[0].as_int = frame_mv[best_mode][best_ref_frame].as_int; xd->mi[0]->bmi[0].as_mv[0].as_int = mi->mv[0].as_int; x->skip_txfm[0] = best_mode_skip_txfm; + mi->ref_frame[1] = best_second_ref_frame; // For spatial enhancement layer: perform intra prediction only if base // layer is chosen as the reference.
Always perform intra prediction if @@ -2074,7 +2217,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, // Perform intra prediction search, if the best SAD is above a certain // threshold. if (best_rdc.rdcost == INT64_MAX || - ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && + ((!force_skip_low_temp_var || bsize < BLOCK_32X32 || + x->content_state_sb == kVeryHighSad) && perform_intra_pred && !x->skip && best_rdc.rdcost > inter_mode_thresh && bsize <= cpi->sf.max_intra_bsize && !x->skip_low_source_sad && !x->lowvar_highsumdiff)) { @@ -2095,15 +2239,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vpx_highbd_convolve_copy( CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, - 0, NULL, 0, bw, bh); + 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, 0, - NULL, 0, bw, bh); + 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH best_pred = this_mode_pred; } @@ -2168,8 +2312,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_mode = this_mode; best_intra_tx_size = mi->tx_size; best_ref_frame = INTRA_FRAME; + best_second_ref_frame = NONE; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; + mi->mv[1].as_int = INVALID_MV; best_mode_skip_txfm = x->skip_txfm[0]; } } @@ -2185,6 +2331,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, pd->dst = orig_dst; mi->mode = best_mode; mi->ref_frame[0] = best_ref_frame; + mi->ref_frame[1] = best_second_ref_frame; x->skip_txfm[0] = best_mode_skip_txfm; if (!is_inter_block(mi)) { @@ -2197,14 +2344,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cm->use_highbitdepth) vpx_highbd_convolve_copy( CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, - CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH } } @@ -2214,6 +2361,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, denoise_svc_pickmode && cpi->denoiser.denoising_level > kDenLowLow && cpi->denoiser.reset == 0) { VP9_DENOISER_DECISION decision = COPY_BLOCK; + ctx->sb_skip_denoising = 0; + // TODO(marpan): There is an issue with denoising when the + // superblock partitioning scheme is based on the pickmode. + // Remove this condition when the issue is resolved. 
+ if (x->sb_pickmode_part) ctx->sb_skip_denoising = 1; vp9_pickmode_ctx_den_update(&ctx_den, zero_last_cost_orig, ref_frame_cost, frame_mv, reuse_inter_pred, best_tx_size, best_mode, best_ref_frame, best_pred_filter, @@ -2225,6 +2377,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, } #endif + if (best_ref_frame == ALTREF_FRAME || best_second_ref_frame == ALTREF_FRAME) + x->arf_frame_usage++; + else if (best_ref_frame != INTRA_FRAME) + x->lastgolden_frame_usage++; + if (cpi->sf.adaptive_rd_thresh) { THR_MODES best_mode_idx = mode_idx[best_ref_frame][mode_offset(mi->mode)]; diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c index f2a59a4af..09f61ead2 100644 --- a/libvpx/vp9/encoder/vp9_quantize.c +++ b/libvpx/vp9/encoder/vp9_quantize.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include <math.h> #include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" @@ -28,33 +29,33 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *iscan) { int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = (tmp * quant_ptr[rc != 0]) >> 16; + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 16; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (tmp) eob = i; - } + if (tmp) eob = i; } *eob_ptr = eob + 1; } #if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -64,24 +65,24 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, int eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); - memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - - if (!skip_block) { - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. 
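Before the high-bit-depth variant resumes below, one note the two quantizers share: quant_ptr holds roughly (1 << 16) / qstep in Q16 fixed point, so (abs_coeff + round) * quant >> 16 approximates abs_coeff / qstep without a division. A minimal standalone sketch with made-up constants (real values come from the encoder's quant tables; the int16 clamp used by the 8-bit path is omitted for brevity):

    #include <stdio.h>

    /* Quantize one coefficient the way vp9_quantize_fp_c does: quant is
     * ~((1 << 16) / qstep), round is the rounding offset, dequant is the
     * step itself. */
    static void quantize_one(int coeff, int round, int quant, int dequant) {
      const int sign = coeff >> 31;
      const int abs_coeff = (coeff ^ sign) - sign;
      const int tmp = ((abs_coeff + round) * quant) >> 16; /* Q16 multiply */
      const int qcoeff = (tmp ^ sign) - sign; /* restore the sign */
      printf("coeff=%d -> qcoeff=%d dqcoeff=%d\n", coeff, qcoeff,
             qcoeff * dequant);
    }

    int main(void) {
      /* Illustrative only: qstep = 32, so quant = 65536 / 32 = 2048. */
      quantize_one(-100, 16, 2048, 32); /* coeff=-100 -> qcoeff=-3 dqcoeff=-96 */
      return 0;
    }

The removed !skip_block branch of the high-bit-depth function continues below.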
- for (i = 0; i < count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp = abs_coeff + round_ptr[rc != 0]; - const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp = abs_coeff + round_ptr[rc != 0]; + const int abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -97,28 +98,28 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *scan, const int16_t *iscan) { int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp = 0; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (tmp) eob = i; + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp = 0; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = (abs_coeff * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (tmp) eob = i; } *eob_ptr = eob + 1; } @@ -132,28 +133,27 @@ void vp9_highbd_quantize_fp_32x32_c( int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - for (i = 0; i < n_coeffs; i++) { - uint32_t abs_qcoeff = 0; - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { - const int64_t tmp = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - } - - if (abs_qcoeff) eob = i; + for (i = 0; i < 
n_coeffs; i++) { + int abs_qcoeff = 0; + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + const int64_t tmp = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_qcoeff = (int)((tmp * quant_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; } + + if (abs_qcoeff) eob = i; } *eob_ptr = eob + 1; } @@ -164,22 +164,28 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; + tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block), + *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); + const int n_coeffs = 4 * 4; + + if (x->skip_block) { + memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff)); + memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff)); + return; + } #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, - p->zbin, p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, + vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, + x->skip_block, p->zbin, p->round, p->quant, + p->quant_shift, qcoeff, dqcoeff, pd->dequant, &p->eobs[block], scan, iscan); return; } #endif - vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin, - p->round, p->quant, p->quant_shift, - BLOCK_OFFSET(p->qcoeff, block), - BLOCK_OFFSET(pd->dqcoeff, block), pd->dequant, &p->eobs[block], - scan, iscan); + vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, + pd->dequant, &p->eobs[block], scan, iscan); } static void invert_quant(int16_t *quant, int16_t *shift, int d) { diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c index 27fea5d4e..b7f3a0e89 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/libvpx/vp9/encoder/vp9_ratectrl.c @@ -44,11 +44,6 @@ #define MIN_BPB_FACTOR 0.005 #define MAX_BPB_FACTOR 50 -#define FRAME_OVERHEAD_BITS 200 - -// Use this macro to turn on/off use of alt-refs in one-pass vbr mode. -#define USE_ALTREF_FOR_ONE_PASS 0 - #if CONFIG_VP9_HIGHBITDEPTH #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ @@ -209,24 +204,29 @@ int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, bit_depth)); return VPXMAX(FRAME_OVERHEAD_BITS, - (int)((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS); + (int)(((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS)); } int vp9_rc_clamp_pframe_target_size(const VP9_COMP *const cpi, int target) { const RATE_CONTROL *rc = &cpi->rc; const VP9EncoderConfig *oxcf = &cpi->oxcf; - const int min_frame_target = - VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); - if (target < min_frame_target) target = min_frame_target; - if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { - // If there is an active ARF at this location use the minimum - // bits on this frame even if it is a constructed arf. - // The active maximum quantizer insures that an appropriate - // number of bits will be spent if needed for constructed ARFs. 
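For context on the restructuring in this hunk: the minimum-size floor (the larger of min_frame_bandwidth and 1/32 of the average frame budget) now applies only when not in two-pass mode, while the max_frame_bandwidth ceiling and the optional rc_max_inter_bitrate_pct cap keep applying to every frame. A condensed sketch of the resulting policy, assuming the final clamp against the inter-bitrate cap that the truncated hunk implies (names shortened, the golden/alt-ref overlay test folded into one flag):

    /* Sketch of vp9_rc_clamp_pframe_target_size() after this patch;
     * two_pass corresponds to cpi->oxcf.pass == 2. */
    static int clamp_pframe_target(int target, int two_pass,
                                   int is_arf_overlay, int min_frame_bw,
                                   int avg_frame_bw, int max_frame_bw,
                                   int max_inter_bitrate_pct) {
      if (!two_pass) {
        const int floor_bits =
            min_frame_bw > (avg_frame_bw >> 5) ? min_frame_bw
                                               : (avg_frame_bw >> 5);
        if (target < floor_bits) target = floor_bits;
        /* An overlay frame for an existing ARF gets only the minimum. */
        if (is_arf_overlay) target = floor_bits;
      }
      if (target > max_frame_bw) target = max_frame_bw;
      if (max_inter_bitrate_pct) {
        const int max_rate = avg_frame_bw * max_inter_bitrate_pct / 100;
        if (target > max_rate) target = max_rate;
      }
      return target;
    }

The reorganized function body continues below.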
- target = min_frame_target; + + if (cpi->oxcf.pass != 2) { + const int min_frame_target = + VPXMAX(rc->min_frame_bandwidth, rc->avg_frame_bandwidth >> 5); + if (target < min_frame_target) target = min_frame_target; + if (cpi->refresh_golden_frame && rc->is_src_frame_alt_ref) { + // If there is an active ARF at this location use the minimum + // bits on this frame even if it is a constructed arf. + // The active maximum quantizer insures that an appropriate + // number of bits will be spent if needed for constructed ARFs. + target = min_frame_target; + } } + // Clip the frame target to the maximum allowed value. if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth; + if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100; @@ -353,8 +353,10 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->af_ratio_onepass_vbr = 10; rc->prev_avg_source_sad_lag = 0; rc->high_source_sad = 0; + rc->reset_high_source_sad = 0; rc->high_source_sad_lagindex = -1; rc->alt_ref_gf_group = 0; + rc->last_frame_is_src_altref = 0; rc->fac_active_worst_inter = 150; rc->fac_active_worst_gf = 100; rc->force_qpmin = 0; @@ -585,7 +587,7 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, // In CBR mode, this makes sure q is between oscillating Qs to prevent // resonance. - if (cpi->oxcf.rc_mode == VPX_CBR && + if (cpi->oxcf.rc_mode == VPX_CBR && !cpi->rc.reset_high_source_sad && (!cpi->oxcf.gf_cbr_boost_pct || !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && @@ -593,13 +595,6 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); } -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf && cpi->oxcf.pass == 0 && - cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && - cpi->rc.is_src_frame_alt_ref && !cpi->rc.alt_ref_gf_group) { - q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1); - } -#endif return q; } @@ -679,7 +674,8 @@ static int calc_active_worst_quality_one_pass_cbr(const VP9_COMP *cpi) { int active_worst_quality; int ambient_qp; unsigned int num_frames_weight_key = 5 * cpi->svc.number_temporal_layers; - if (cm->frame_type == KEY_FRAME) return rc->worst_quality; + if (cm->frame_type == KEY_FRAME || rc->reset_high_source_sad) + return rc->worst_quality; // For ambient_qp we use minimum of avg_frame_qindex[KEY_FRAME/INTER_FRAME] // for the first few frames following key frame. These are both initialized // to worst_quality and updated with (3/4, 1/4) average in postencode_update. @@ -1011,6 +1007,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, qdelta = vp9_compute_qdelta_by_rate( &cpi->rc, cm->frame_type, active_worst_quality, 1.75, cm->bit_depth); } + if (rc->high_source_sad && cpi->sf.use_altref_onepass) qdelta = 0; *top_index = active_worst_quality + qdelta; *top_index = (*top_index > *bottom_index) ? 
*top_index : *bottom_index; } @@ -1339,6 +1336,28 @@ static void update_golden_frame_stats(VP9_COMP *cpi) { } } +static void update_altref_usage(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + int sum_ref_frame_usage = 0; + int arf_frame_usage = 0; + int mi_row, mi_col; + if (cpi->rc.alt_ref_gf_group && !cpi->rc.is_src_frame_alt_ref && + !cpi->refresh_golden_frame && !cpi->refresh_alt_ref_frame) + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8) { + int sboffset = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3); + sum_ref_frame_usage += cpi->count_arf_frame_usage[sboffset] + + cpi->count_lastgolden_frame_usage[sboffset]; + arf_frame_usage += cpi->count_arf_frame_usage[sboffset]; + } + } + if (sum_ref_frame_usage > 0) { + double altref_count = 100.0 * arf_frame_usage / sum_ref_frame_usage; + cpi->rc.perc_arf_usage = + 0.75 * cpi->rc.perc_arf_usage + 0.25 * altref_count; + } +} + static void compute_frame_low_motion(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; int mi_row, mi_col; @@ -1462,8 +1481,15 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { } if (oxcf->pass == 0) { - if (cm->frame_type != KEY_FRAME) compute_frame_low_motion(cpi); + if (cm->frame_type != KEY_FRAME) { + compute_frame_low_motion(cpi); + if (cpi->sf.use_altref_onepass) update_altref_usage(cpi); + } + cpi->rc.last_frame_is_src_altref = cpi->rc.is_src_frame_alt_ref; } + if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0; + + rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -1556,8 +1582,9 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). 
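The assignment that follows scales DEFAULT_GF_BOOST by 2m / (m + 100), where m is avg_frame_low_motion, then floors the result at 500: fully static content (m = 100) keeps the whole boost and m = 50 keeps two thirds of it. A spot check, assuming DEFAULT_GF_BOOST is 2000 (its definition elsewhere in vp9_ratectrl.c; treat the constant as an assumption here):

    #define DEFAULT_GF_BOOST 2000

    /* Mirrors the boost formula in the assignment below. */
    static int gfu_boost_for(int avg_frame_low_motion) {
      const int m = avg_frame_low_motion;
      const int boost = DEFAULT_GF_BOOST * (m << 1) / (m + 100);
      return boost > 500 ? boost : 500;
    }

    /* gfu_boost_for(100) == 2000, gfu_boost_for(50) == 1333,
     * gfu_boost_for(10) == 500 (363 before the floor). */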
rc->gfu_boost = - VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / - (rc->avg_frame_low_motion + 100)); + VPXMAX(500, + DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) / + (rc->avg_frame_low_motion + 100)); rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400)); } adjust_gfint_frame_constraint(cpi, rc->frames_to_key); @@ -1565,12 +1592,10 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { cpi->refresh_golden_frame = 1; rc->source_alt_ref_pending = 0; rc->alt_ref_gf_group = 0; -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) { + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { rc->source_alt_ref_pending = 1; rc->alt_ref_gf_group = 1; } -#endif } if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); @@ -1847,6 +1872,26 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi, // Clamp min to max rc->min_gf_interval = VPXMIN(rc->min_gf_interval, rc->max_gf_interval); + + if (oxcf->target_level == LEVEL_AUTO) { + const uint32_t pic_size = cpi->common.width * cpi->common.height; + const uint32_t pic_breadth = + VPXMAX(cpi->common.width, cpi->common.height); + int i; + for (i = LEVEL_1; i < LEVEL_MAX; ++i) { + if (vp9_level_defs[i].max_luma_picture_size >= pic_size && + vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) { + if (rc->min_gf_interval <= + (int)vp9_level_defs[i].min_altref_distance) { + rc->min_gf_interval = + (int)vp9_level_defs[i].min_altref_distance + 1; + rc->max_gf_interval = + VPXMAX(rc->max_gf_interval, rc->min_gf_interval); + } + break; + } + } + } } } @@ -1933,9 +1978,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) { else target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate); - // Correction to rate target based on prior over or under shoot. - if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) - vbr_rate_correction(cpi, &target_rate); + if (!cpi->oxcf.vbr_corpus_complexity) { + // Correction to rate target based on prior over or under shoot. + if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ) + vbr_rate_correction(cpi, &target_rate); + } vp9_rc_set_frame_target(cpi, target_rate); } @@ -2070,7 +2117,8 @@ int vp9_resize_one_pass_cbr(VP9_COMP *cpi) { return resize_action; } -void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { +static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, + uint64_t avg_sad_current) { VP9_COMMON *const cm = &cpi->common; RATE_CONTROL *const rc = &cpi->rc; int target; @@ -2081,7 +2129,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { uint64_t avg_source_sad_lag = avg_sad_current; int high_source_sad_lagindex = -1; int steady_sad_lagindex = -1; - uint32_t sad_thresh1 = 60000; + uint32_t sad_thresh1 = 70000; uint32_t sad_thresh2 = 120000; int low_content = 0; int high_content = 0; @@ -2185,11 +2233,16 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->af_ratio_onepass_vbr = 5; rc->gfu_boost = DEFAULT_GF_BOOST >> 2; } -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) { - // Don't use alt-ref if there is a scene cut within the group, - // or content is not low. - if ((rc->high_source_sad_lagindex > 0 && + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) { + // Flag to disable usage of ARF based on past usage, only allow this + // disabling if current frame/group does not start with key frame or + // scene cut. Note perc_arf_usage is only computed for speed >= 5. 
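Tying this to update_altref_usage() earlier in the file: perc_arf_usage is a running average, 0.75 of the previous estimate plus 0.25 of the percentage of superblocks whose best mode referenced the ARF, and the arf_usage_low flag computed just below disables the alt-ref for a group once that average drops under 15. A small sketch of the estimator and its decay (the helper name is hypothetical):

    /* Running estimate of how often the alt-ref is actually referenced;
     * counts are per-frame superblock tallies. */
    static void update_arf_usage(double *perc_arf_usage, int arf_count,
                                 int total_count) {
      if (total_count > 0) {
        const double arf_pct = 100.0 * arf_count / total_count;
        *perc_arf_usage = 0.75 * *perc_arf_usage + 0.25 * arf_pct;
      }
    }

    /* Example decay: starting at 40%, four frames with 0% ARF usage give
     * 40 * 0.75^4 ~= 12.7, under the 15 cutoff below, so the next GF
     * group will drop the alt-ref. */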
+ int arf_usage_low = + (cm->frame_type != KEY_FRAME && !rc->high_source_sad && + cpi->rc.perc_arf_usage < 15 && cpi->oxcf.speed >= 5); + // Don't use alt-ref for this group under certain conditions. + if (arf_usage_low || + (rc->high_source_sad_lagindex > 0 && rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { rc->source_alt_ref_pending = 0; @@ -2198,12 +2251,12 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->source_alt_ref_pending = 1; rc->alt_ref_gf_group = 1; // If alt-ref is used for this gf group, limit the interval. - if (rc->baseline_gf_interval > 10 && - rc->baseline_gf_interval < rc->frames_to_key) - rc->baseline_gf_interval = 10; + if (rc->baseline_gf_interval > 12) { + rc->baseline_gf_interval = 12; + rc->frames_till_gf_update_due = rc->baseline_gf_interval; + } } } -#endif target = calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); } @@ -2233,11 +2286,14 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { int start_frame = 0; int frames_to_buffer = 1; int frame = 0; + int scene_cut_force_key_frame = 0; uint64_t avg_sad_current = 0; uint32_t min_thresh = 4000; float thresh = 8.0f; + uint32_t thresh_key = 140000; + if (cpi->oxcf.speed <= 5) thresh_key = 240000; if (cpi->oxcf.rc_mode == VPX_VBR) { - min_thresh = 60000; + min_thresh = 65000; thresh = 2.1f; } if (cpi->oxcf.lag_in_frames > 0) { @@ -2263,6 +2319,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad_current > thresh_key) + scene_cut_force_key_frame = 1; // Update recursive average for current frame. if (avg_sad_current > 0) rc->avg_source_sad[0] = @@ -2323,6 +2381,8 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { rc->high_source_sad = 1; else rc->high_source_sad = 0; + if (rc->high_source_sad && avg_sad > thresh_key) + scene_cut_force_key_frame = 1; if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR) rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2; } else { @@ -2330,6 +2390,23 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { } } } + // For CBR non-screen content mode, check if we should reset the rate + // control. Reset is done if high_source_sad is detected and the rate + // control is at very low QP with rate correction factor at min level. + if (cpi->oxcf.rc_mode == VPX_CBR && + cpi->oxcf.content != VP9E_CONTENT_SCREEN && !cpi->use_svc) { + if (rc->high_source_sad && rc->last_q[INTER_FRAME] == rc->best_quality && + rc->avg_frame_qindex[INTER_FRAME] < (rc->best_quality << 1) && + rc->rate_correction_factors[INTER_NORMAL] == MIN_BPB_FACTOR) { + rc->rate_correction_factors[INTER_NORMAL] = 0.5; + rc->avg_frame_qindex[INTER_FRAME] = rc->worst_quality; + rc->buffer_level = rc->optimal_buffer_level; + rc->bits_off_target = rc->optimal_buffer_level; + rc->reset_high_source_sad = 1; + } + if (cm->frame_type != KEY_FRAME && rc->reset_high_source_sad) + rc->this_frame_target = rc->avg_frame_bandwidth; + } // For VBR, under scene change/high content change, force golden refresh. 
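On the scene-detection side of this hunk: a frame is flagged high_source_sad when its average source SAD clears both a fixed floor and a multiple of the running SAD average, and the new scene_cut_force_key_frame additionally promotes the event to a key frame once the SAD passes thresh_key. A schematic of that decision using the VBR numbers above (min_thresh 65000, factor 2.1, thresh_key 140000 for speed over 5); the encoder compares against rc->avg_source_sad via a VPXMAX of the two bounds, which this two-condition form is equivalent to:

    #include <stdint.h>

    /* Schematic of the per-frame test in vp9_scene_detection_onepass(). */
    static void detect_scene_change(uint64_t avg_sad_current,
                                    uint64_t avg_source_sad,
                                    int *high_source_sad,
                                    int *force_key_frame) {
      const uint64_t min_thresh = 65000;  /* VBR floor in this patch */
      const double thresh = 2.1;          /* multiple of the running avg */
      const uint64_t thresh_key = 140000; /* 240000 when speed <= 5 */
      *high_source_sad =
          avg_sad_current > min_thresh &&
          (double)avg_sad_current > thresh * (double)avg_source_sad;
      *force_key_frame = *high_source_sad && avg_sad_current > thresh_key;
    }

The VBR golden-refresh branch announced just above follows.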
if (cpi->oxcf.rc_mode == VPX_VBR && cm->frame_type != KEY_FRAME && rc->high_source_sad && rc->frames_to_key > 3 && @@ -2337,10 +2414,10 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) { cpi->ext_refresh_frame_flags_pending == 0) { int target; cpi->refresh_golden_frame = 1; + if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME; rc->source_alt_ref_pending = 0; -#if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.enable_auto_arf) rc->source_alt_ref_pending = 1; -#endif + if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf) + rc->source_alt_ref_pending = 1; rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->baseline_gf_interval = VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h index 9e4623195..c1b210677 100644 --- a/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/libvpx/vp9/encoder/vp9_ratectrl.h @@ -32,6 +32,8 @@ extern "C" { #define FIXED_GF_INTERVAL 8 // Used in some testing modes only #define ONEHALFONLY_RESIZE 0 +#define FRAME_OVERHEAD_BITS 200 + typedef enum { INTER_NORMAL = 0, INTER_HIGH = 1, @@ -150,6 +152,8 @@ typedef struct { int rc_2_frame; int q_1_frame; int q_2_frame; + // Keep track of the last target average frame bandwidth. + int last_avg_frame_bandwidth; // Auto frame-scaling variables. FRAME_SCALE_LEVEL frame_size_selector; @@ -164,11 +168,14 @@ typedef struct { uint64_t prev_avg_source_sad_lag; int high_source_sad_lagindex; int alt_ref_gf_group; + int last_frame_is_src_altref; int high_source_sad; int count_last_scene_change; int avg_frame_low_motion; int af_ratio_onepass_vbr; int force_qpmin; + int reset_high_source_sad; + double perc_arf_usage; } RATE_CONTROL; struct VP9_COMP; diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c index 39a7742f0..6b2306ce9 100644 --- a/libvpx/vp9/encoder/vp9_rd.c +++ b/libvpx/vp9/encoder/vp9_rd.c @@ -670,19 +670,21 @@ void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, } } -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth) { - const int q = vp9_dc_quant(qindex, qdelta, bit_depth); -#if CONFIG_VP9_HIGHBITDEPTH - switch (bit_depth) { - case VPX_BITS_8: return 20 * q; - case VPX_BITS_10: return 5 * q; - case VPX_BITS_12: return ROUND_POWER_OF_TWO(5 * q, 2); - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; - } -#else - return 20 * q; -#endif // CONFIG_VP9_HIGHBITDEPTH +int vp9_get_intra_cost_penalty(const VP9_COMP *const cpi, BLOCK_SIZE bsize, + int qindex, int qdelta) { + // Reduce the intra cost penalty for small blocks (<=16x16). + int reduction_fac = + (bsize <= BLOCK_16X16) ? ((bsize <= BLOCK_8X8) ? 4 : 2) : 0; + + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level == kHigh) + // Don't reduce intra cost penalty if estimated noise level is high. + reduction_fac = 0; + + // Always use VPX_BITS_8 as input here because the penalty is applied + // to rate not distortion so we want a consistent penalty for all bit + // depths. If the actual bit depth were passed in here then the value + // retured by vp9_dc_quant() would scale with the bit depth and we would + // then need to apply inverse scaling to correct back to a bit depth + // independent rate penalty. 
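Concretely, the return that follows computes penalty = (20 * dc_quant_8bit) >> reduction_fac. A worked sketch of the block-size scaling (dc_quant_8bit() is a stub standing in for vp9_dc_quant(qindex, qdelta, VPX_BITS_8); the step value 40 is an arbitrary illustration, not a real table entry):

    /* Stub for the 8-bit DC quantizer step from the quant lookup table. */
    static int dc_quant_8bit(int qindex, int qdelta) {
      (void)qindex;
      (void)qdelta;
      return 40; /* placeholder value */
    }

    static int intra_cost_penalty(int qindex, int qdelta, int is_le_16x16,
                                  int is_le_8x8, int high_noise) {
      /* Halve the penalty for <=16x16 blocks and quarter it for <=8x8,
       * unless the noise estimate is high. */
      int reduction_fac = is_le_16x16 ? (is_le_8x8 ? 4 : 2) : 0;
      if (high_noise) reduction_fac = 0;
      return (20 * dc_quant_8bit(qindex, qdelta)) >> reduction_fac;
    }

    /* With the placeholder step of 40: 800 for blocks above 16x16,
     * 200 for 16x16, 50 for 8x8 and smaller. */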
+ return (20 * vp9_dc_quant(qindex, qdelta, VPX_BITS_8)) >> reduction_fac; } diff --git a/libvpx/vp9/encoder/vp9_rd.h b/libvpx/vp9/encoder/vp9_rd.h index 1e1176866..59022c106 100644 --- a/libvpx/vp9/encoder/vp9_rd.h +++ b/libvpx/vp9/encoder/vp9_rd.h @@ -191,13 +191,18 @@ void vp9_setup_pred_block(const MACROBLOCKD *xd, const struct scale_factors *scale, const struct scale_factors *scale_uv); -int vp9_get_intra_cost_penalty(int qindex, int qdelta, - vpx_bit_depth_t bit_depth); +int vp9_get_intra_cost_penalty(const struct VP9_COMP *const cpi, + BLOCK_SIZE bsize, int qindex, int qdelta); +unsigned int vp9_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs); unsigned int vp9_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs); #if CONFIG_VP9_HIGHBITDEPTH +unsigned int vp9_high_get_sby_variance(struct VP9_COMP *cpi, + const struct buf_2d *ref, BLOCK_SIZE bs, + int bd); unsigned int vp9_high_get_sby_perpixel_variance(struct VP9_COMP *cpi, const struct buf_2d *ref, BLOCK_SIZE bs, int bd); diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c index bf0fec3d8..2ba6378c5 100644 --- a/libvpx/vp9/encoder/vp9_rdopt.c +++ b/libvpx/vp9/encoder/vp9_rdopt.c @@ -600,7 +600,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, - 32, NULL, 0, NULL, 0, bs, bs, xd->bd); + 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); } else { @@ -623,7 +623,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, recon = CONVERT_TO_BYTEPTR(recon16); } else { #endif // CONFIG_VP9_HIGHBITDEPTH - vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); + vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; @@ -632,7 +632,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is significant (not just an optimization) for // the lossless case. - x->itxm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, *eob); break; default: assert(0 && "Invalid transform size"); break; } @@ -730,7 +730,8 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } } else { // SKIP_TXFM_AC_DC - // skip forward transform + // skip forward transform. Because this is handled here, the quantization + // does not need to do it. 
x->plane[plane].eobs[block] = 0; sse = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4; dist = sse; @@ -1576,8 +1577,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, k += (idy * 2 + idx); coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]); coeff = BLOCK_OFFSET(p->coeff, k); - x->fwd_txm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), - coeff, 8); + x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), + coeff, 8); vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan); #if CONFIG_VP9_HIGHBITDEPTH thisdistortion += vp9_highbd_block_error_dispatch( @@ -2875,57 +2876,82 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, // This function is designed to apply a bias or adjustment to an rd value based // on the relative variance of the source and reconstruction. -#define LOW_VAR_THRESH 16 -#define VLOW_ADJ_MAX 25 -#define VHIGH_ADJ_MAX 8 +#define VERY_LOW_VAR_THRESH 2 +#define LOW_VAR_THRESH 5 +#define VAR_MULT 100 +static unsigned int max_var_adjust[VP9E_CONTENT_INVALID] = { 16, 16, 100 }; + static void rd_variance_adjustment(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int64_t *this_rd, MV_REFERENCE_FRAME ref_frame, unsigned int source_variance) { MACROBLOCKD *const xd = &x->e_mbd; - unsigned int recon_variance; + unsigned int rec_variance; + unsigned int src_variance; + unsigned int src_rec_min; unsigned int absvar_diff = 0; - int64_t var_error = 0; - int64_t var_factor = 0; + unsigned int var_factor = 0; + unsigned int adj_max; + vp9e_tune_content content_type = cpi->oxcf.content; if (*this_rd == INT64_MAX) return; #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - recon_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, + if (source_variance > 0) { + rec_variance = vp9_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); + src_variance = source_variance; + } else { + rec_variance = + vp9_high_get_sby_variance(cpi, &xd->plane[0].dst, bsize, xd->bd); + src_variance = + vp9_high_get_sby_variance(cpi, &x->plane[0].src, bsize, xd->bd); + } } else { - recon_variance = - vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + if (source_variance > 0) { + rec_variance = + vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = source_variance; + } else { + rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } } #else - recon_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + if (source_variance > 0) { + rec_variance = vp9_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = source_variance; + } else { + rec_variance = vp9_get_sby_variance(cpi, &xd->plane[0].dst, bsize); + src_variance = vp9_get_sby_variance(cpi, &x->plane[0].src, bsize); + } #endif // CONFIG_VP9_HIGHBITDEPTH - if ((source_variance + recon_variance) > LOW_VAR_THRESH) { - absvar_diff = (source_variance > recon_variance) - ? (source_variance - recon_variance) - : (recon_variance - source_variance); + // Lower of source (raw per pixel value) and recon variance. Note that + // if the source per pixel is 0 then the recon value here will not be per + // pixel (see above) so will likely be much larger. 
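The replacement logic below then applies a multiplicative penalty proportional to the relative variance mismatch, var_factor = min(adj_max, VAR_MULT * |src - rec| / max(1, src)), and inflates this_rd by that percentage. A worked sketch (adj_max = 100 matches the VP9E_CONTENT_FILM entry of max_var_adjust):

    #include <stdint.h>

    /* Sketch of the new rd_variance_adjustment() core: only flat areas
     * (min of source/recon variance <= LOW_VAR_THRESH == 5) are touched. */
    static int64_t adjust_rd(int64_t rd, unsigned int src_var,
                             unsigned int rec_var, unsigned int adj_max) {
      const unsigned int low_var_thresh = 5;
      const unsigned int min_var = src_var < rec_var ? src_var : rec_var;
      unsigned int absdiff, var_factor;
      if (min_var > low_var_thresh) return rd;
      absdiff = src_var > rec_var ? src_var - rec_var : rec_var - src_var;
      var_factor = (unsigned int)(((int64_t)100 * absdiff) /
                                  (src_var > 0 ? src_var : 1));
      if (var_factor > adj_max) var_factor = adj_max;
      return rd + rd * var_factor / 100;
    }

    /* Example: src_var = 4, rec_var = 12, adj_max = 100 (film):
     * var_factor = min(100, 100 * 8 / 4) = 100, so the RD cost doubles. */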
+ src_rec_min = VPXMIN(source_variance, rec_variance); - var_error = ((int64_t)200 * source_variance * recon_variance) / - (((int64_t)source_variance * source_variance) + - ((int64_t)recon_variance * recon_variance)); - var_error = 100 - var_error; - } + if (src_rec_min > LOW_VAR_THRESH) return; + + absvar_diff = (src_variance > rec_variance) ? (src_variance - rec_variance) + : (rec_variance - src_variance); + + adj_max = max_var_adjust[content_type]; + + var_factor = + (unsigned int)((int64_t)VAR_MULT * absvar_diff) / VPXMAX(1, src_variance); + var_factor = VPXMIN(adj_max, var_factor); - // Source variance above a threshold and ref frame is intra. - // This case is targeted mainly at discouraging intra modes that give rise - // to a predictor with a low spatial complexity compared to the source. - if ((source_variance > LOW_VAR_THRESH) && (ref_frame == INTRA_FRAME) && - (source_variance > recon_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VLOW_ADJ_MAX, var_error)); - // A second possible case of interest is where the source variance - // is very low and we wish to discourage false texture or motion trails. - } else if ((source_variance < (LOW_VAR_THRESH >> 1)) && - (recon_variance > source_variance)) { - var_factor = VPXMIN(absvar_diff, VPXMIN(VHIGH_ADJ_MAX, var_error)); - } *this_rd += (*this_rd * var_factor) / 100; + + if (content_type == VP9E_CONTENT_FILM) { + if (src_rec_min <= VERY_LOW_VAR_THRESH) { + if (ref_frame == INTRA_FRAME) *this_rd *= 2; + if (bsize > 6) *this_rd *= 2; + } + } } // Do we have an internal image edge (e.g. formatting bars). @@ -3037,8 +3063,8 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; PREDICTION_MODE mode_uv[TX_SIZES]; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int best_skip2 = 0; uint8_t ref_frame_skip_mask[2] = { 0 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; @@ -3801,8 +3827,8 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t dist_uv; int skip_uv; PREDICTION_MODE mode_uv = DC_PRED; - const int intra_cost_penalty = vp9_get_intra_cost_penalty( - cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); + const int intra_cost_penalty = + vp9_get_intra_cost_penalty(cpi, bsize, cm->base_qindex, cm->y_dc_delta_q); int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; diff --git a/libvpx/vp9/encoder/vp9_skin_detection.c b/libvpx/vp9/encoder/vp9_skin_detection.c index 3f3d48fb9..cc6c96776 100644 --- a/libvpx/vp9/encoder/vp9_skin_detection.c +++ b/libvpx/vp9/encoder/vp9_skin_detection.c @@ -15,75 +15,6 @@ #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_skin_detection.h" -#define MODEL_MODE 1 - -// Fixed-point skin color model parameters. -static const int skin_mean[5][2] = { { 7463, 9614 }, - { 6400, 10240 }, - { 7040, 10240 }, - { 8320, 9280 }, - { 6800, 9614 } }; -static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 -static const int skin_threshold[6] = { 1570636, 1400000, 800000, - 800000, 800000, 800000 }; // q18 - -// Thresholds on luminance. -static const int y_low = 40; -static const int y_high = 220; - -// Evaluates the Mahalanobis distance measure for the input CbCr values. 
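The helper removed below (this patch moves skin detection into vpx_dsp, hence the vpx_skin_pixel() call above) evaluates d = (x - mu)^T * Sigma^(-1) * (x - mu) for a CbCr pair in fixed point: means in q6, inverse covariance in q16, result in q18. In floating point the same measure reads roughly as follows, using the first model's constants; the matching q18 threshold of 1570636 is about 6.0 in these units:

    /* Floating-point rendering of the fixed-point Mahalanobis distance.
     * skin_mean[0] = {7463, 9614} in q6 is about (116.6, 150.2);
     * skin_inv_cov = {4107, 1663, 1663, 2157} in q16 is about
     * {0.0627, 0.0254, 0.0254, 0.0329}. */
    static double skin_color_difference(double cb, double cr) {
      const double mean_cb = 7463.0 / 64.0, mean_cr = 9614.0 / 64.0;
      const double icov00 = 4107.0 / 65536.0, icov01 = 1663.0 / 65536.0;
      const double icov11 = 2157.0 / 65536.0;
      const double dcb = cb - mean_cb, dcr = cr - mean_cr;
      return icov00 * dcb * dcb + 2.0 * icov01 * dcb * dcr +
             icov11 * dcr * dcr;
    }

The removed fixed-point implementation follows.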
-static int evaluate_skin_color_difference(int cb, int cr, int idx) { - const int cb_q6 = cb << 6; - const int cr_q6 = cr << 6; - const int cb_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); - const int cbcr_diff_q12 = - (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); - const int cr_diff_q12 = - (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); - const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; - const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; - const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; - const int skin_diff = - skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + - skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; - return skin_diff; -} - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion) { - if (y < y_low || y > y_high) { - return 0; - } else { - if (MODEL_MODE == 0) { - return (evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); - } else { - int i = 0; - // Exit on grey. - if (cb == 128 && cr == 128) return 0; - // Exit on very strong cb. - if (cb > 150 && cr < 110) return 0; - for (; i < 5; i++) { - int skin_color_diff = evaluate_skin_color_difference(cb, cr, i); - if (skin_color_diff < skin_threshold[i + 1]) { - if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) - return 0; - else if (motion == 0 && - skin_color_diff > (skin_threshold[i + 1] >> 1)) - return 0; - else - return 1; - } - // Exit if difference is much large than the threshold. - if (skin_color_diff > (skin_threshold[i + 1] << 3)) { - return 0; - } - } - return 0; - } - } -} - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn) { @@ -100,31 +31,113 @@ int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, const uint8_t ysource = y[y_height_shift * stride + y_width_shift]; const uint8_t usource = u[uv_height_shift * strideuv + uv_width_shift]; const uint8_t vsource = v[uv_height_shift * strideuv + uv_width_shift]; + if (consec_zeromv > 25 && curr_motion_magn == 0) motion = 0; - return vp9_skin_pixel(ysource, usource, vsource, motion); + return vpx_skin_pixel(ysource, usource, vsource, motion); + } +} + +void vp9_compute_skin_sb(VP9_COMP *const cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + int i, j, num_bl; + VP9_COMMON *const cm = &cpi->common; + const uint8_t *src_y = cpi->Source->y_buffer; + const uint8_t *src_u = cpi->Source->u_buffer; + const uint8_t *src_v = cpi->Source->v_buffer; + const int src_ystride = cpi->Source->y_stride; + const int src_uvstride = cpi->Source->uv_stride; + const int y_bsize = 4 << b_width_log2_lookup[bsize]; + const int uv_bsize = y_bsize >> 1; + const int shy = (y_bsize == 8) ? 3 : 4; + const int shuv = shy - 1; + const int fac = y_bsize / 8; + const int y_shift = src_ystride * (mi_row << 3) + (mi_col << 3); + const int uv_shift = src_uvstride * (mi_row << 2) + (mi_col << 2); + const int mi_row_limit = VPXMIN(mi_row + 8, cm->mi_rows - 2); + const int mi_col_limit = VPXMIN(mi_col + 8, cm->mi_cols - 2); + src_y += y_shift; + src_u += uv_shift; + src_v += uv_shift; + + for (i = mi_row; i < mi_row_limit; i += fac) { + num_bl = 0; + for (j = mi_col; j < mi_col_limit; j += fac) { + int consec_zeromv = 0; + int bl_index = i * cm->mi_cols + j; + int bl_index1 = bl_index + 1; + int bl_index2 = bl_index + cm->mi_cols; + int bl_index3 = bl_index2 + 1; + // Don't detect skin on the boundary. 
+ if (i == 0 || j == 0) continue; + if (bsize == BLOCK_8X8) + consec_zeromv = cpi->consec_zero_mv[bl_index]; + else + consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], + VPXMIN(cpi->consec_zero_mv[bl_index1], + VPXMIN(cpi->consec_zero_mv[bl_index2], + cpi->consec_zero_mv[bl_index3]))); + cpi->skin_map[bl_index] = + vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, src_uvstride, + bsize, consec_zeromv, 0); + num_bl++; + src_y += y_bsize; + src_u += uv_bsize; + src_v += uv_bsize; + } + src_y += (src_ystride << shy) - (num_bl << shy); + src_u += (src_uvstride << shuv) - (num_bl << shuv); + src_v += (src_uvstride << shuv) - (num_bl << shuv); + } + + // Remove isolated skin blocks (none of its neighbors are skin) and isolated + // non-skin blocks (all of its neighbors are skin). + // Skip 4 corner blocks which have only 3 neighbors to remove isolated skin + // blocks. Skip superblock borders to remove isolated non-skin blocks. + for (i = mi_row; i < mi_row_limit; i += fac) { + for (j = mi_col; j < mi_col_limit; j += fac) { + int bl_index = i * cm->mi_cols + j; + int num_neighbor = 0; + int mi, mj; + int non_skin_threshold = 8; + // Skip 4 corners. + if ((i == mi_row && (j == mi_col || j == mi_col_limit - fac)) || + (i == mi_row_limit - fac && (j == mi_col || j == mi_col_limit - fac))) + continue; + // There are only 5 neighbors for non-skin blocks on the border. + if (i == mi_row || i == mi_row_limit - fac || j == mi_col || + j == mi_col_limit - fac) + non_skin_threshold = 5; + + for (mi = -fac; mi <= fac; mi += fac) { + for (mj = -fac; mj <= fac; mj += fac) { + if (i + mi >= mi_row && i + mi < mi_row_limit && j + mj >= mi_col && + j + mj < mi_col_limit) { + int bl_neighbor_index = (i + mi) * cm->mi_cols + j + mj; + if (cpi->skin_map[bl_neighbor_index]) num_neighbor++; + } + } + } + + if (cpi->skin_map[bl_index] && num_neighbor < 2) + cpi->skin_map[bl_index] = 0; + if (!cpi->skin_map[bl_index] && num_neighbor == non_skin_threshold) + cpi->skin_map[bl_index] = 1; + } } } #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. -void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { +void vp9_output_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { int i, j, mi_row, mi_col, num_bl; VP9_COMMON *const cm = &cpi->common; uint8_t *y; const uint8_t *src_y = cpi->Source->y_buffer; - const uint8_t *src_u = cpi->Source->u_buffer; - const uint8_t *src_v = cpi->Source->v_buffer; const int src_ystride = cpi->Source->y_stride; - const int src_uvstride = cpi->Source->uv_stride; - int y_bsize = 16; // Use 8x8 or 16x16. - int uv_bsize = y_bsize >> 1; - int ypos = y_bsize >> 1; - int uvpos = uv_bsize >> 1; - int shy = (y_bsize == 8) ? 3 : 4; - int shuv = shy - 1; - int fac = y_bsize / 8; - // Use center pixel or average of center 2x2 pixels. - int mode_filter = 0; + const int y_bsize = 16; // Use 8x8 or 16x16. + const int shy = (y_bsize == 8) ? 3 : 4; + const int fac = y_bsize / 8; + YV12_BUFFER_CONFIG skinmap; memset(&skinmap, 0, sizeof(YV12_BUFFER_CONFIG)); if (vpx_alloc_frame_buffer(&skinmap, cm->width, cm->height, cm->subsampling_x, @@ -141,65 +154,21 @@ void vp9_compute_skin_map(VP9_COMP *const cpi, FILE *yuv_skinmap_file) { for (mi_row = 0; mi_row < cm->mi_rows - 1; mi_row += fac) { num_bl = 0; for (mi_col = 0; mi_col < cm->mi_cols - 1; mi_col += fac) { - int is_skin = 0; - if (mode_filter == 1) { - // Use 2x2 average at center. 
- uint8_t ysource = src_y[ypos * src_ystride + ypos]; - uint8_t usource = src_u[uvpos * src_uvstride + uvpos]; - uint8_t vsource = src_v[uvpos * src_uvstride + uvpos]; - uint8_t ysource2 = src_y[(ypos + 1) * src_ystride + ypos]; - uint8_t usource2 = src_u[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t vsource2 = src_v[(uvpos + 1) * src_uvstride + uvpos]; - uint8_t ysource3 = src_y[ypos * src_ystride + (ypos + 1)]; - uint8_t usource3 = src_u[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t vsource3 = src_v[uvpos * src_uvstride + (uvpos + 1)]; - uint8_t ysource4 = src_y[(ypos + 1) * src_ystride + (ypos + 1)]; - uint8_t usource4 = src_u[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - uint8_t vsource4 = src_v[(uvpos + 1) * src_uvstride + (uvpos + 1)]; - ysource = (ysource + ysource2 + ysource3 + ysource4) >> 2; - usource = (usource + usource2 + usource3 + usource4) >> 2; - vsource = (vsource + vsource2 + vsource3 + vsource4) >> 2; - is_skin = vp9_skin_pixel(ysource, usource, vsource, 1); - } else { - int block_size = BLOCK_8X8; - int consec_zeromv = 0; - int bl_index = mi_row * cm->mi_cols + mi_col; - int bl_index1 = bl_index + 1; - int bl_index2 = bl_index + cm->mi_cols; - int bl_index3 = bl_index2 + 1; - if (y_bsize == 8) - consec_zeromv = cpi->consec_zero_mv[bl_index]; - else - consec_zeromv = - VPXMIN(cpi->consec_zero_mv[bl_index], - VPXMIN(cpi->consec_zero_mv[bl_index1], - VPXMIN(cpi->consec_zero_mv[bl_index2], - cpi->consec_zero_mv[bl_index3]))); - if (y_bsize == 16) block_size = BLOCK_16X16; - is_skin = - vp9_compute_skin_block(src_y, src_u, src_v, src_ystride, - src_uvstride, block_size, consec_zeromv, 0); - } + const int block_index = mi_row * cm->mi_cols + mi_col; + const int is_skin = cpi->skin_map[block_index]; for (i = 0; i < y_bsize; i++) { for (j = 0; j < y_bsize; j++) { - if (is_skin) - y[i * src_ystride + j] = 255; - else - y[i * src_ystride + j] = src_y[i * src_ystride + j]; + y[i * src_ystride + j] = is_skin ? 255 : src_y[i * src_ystride + j]; } } num_bl++; y += y_bsize; src_y += y_bsize; - src_u += uv_bsize; - src_v += uv_bsize; } y += (src_ystride << shy) - (num_bl << shy); src_y += (src_ystride << shy) - (num_bl << shy); - src_u += (src_uvstride << shuv) - (num_bl << shuv); - src_v += (src_uvstride << shuv) - (num_bl << shuv); } - vp9_write_yuv_frame_420(&skinmap, yuv_skinmap_file); + vpx_write_yuv_frame(yuv_skinmap_file, &skinmap); vpx_free_frame_buffer(&skinmap); } #endif diff --git a/libvpx/vp9/encoder/vp9_skin_detection.h b/libvpx/vp9/encoder/vp9_skin_detection.h index c77382dbd..8880bff46 100644 --- a/libvpx/vp9/encoder/vp9_skin_detection.h +++ b/libvpx/vp9/encoder/vp9_skin_detection.h @@ -12,6 +12,8 @@ #define VP9_ENCODER_VP9_SKIN_MAP_H_ #include "vp9/common/vp9_blockd.h" +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" #ifdef __cplusplus extern "C" { @@ -19,19 +21,16 @@ extern "C" { struct VP9_COMP; -// #define OUTPUT_YUV_SKINMAP - -int vp9_skin_pixel(const uint8_t y, const uint8_t cb, const uint8_t cr, - int motion); - int vp9_compute_skin_block(const uint8_t *y, const uint8_t *u, const uint8_t *v, int stride, int strideuv, int bsize, int consec_zeromv, int curr_motion_magn); +void vp9_compute_skin_sb(struct VP9_COMP *const cpi, BLOCK_SIZE bsize, + int mi_row, int mi_col); + #ifdef OUTPUT_YUV_SKINMAP // For viewing skin map on input source. 
-void vp9_compute_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); -extern void vp9_write_yuv_frame_420(YV12_BUFFER_CONFIG *s, FILE *f); +void vp9_output_skin_map(struct VP9_COMP *const cpi, FILE *yuv_skinmap_file); #endif #ifdef __cplusplus diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c index 8d9e2e8c3..a05db60c6 100644 --- a/libvpx/vp9/encoder/vp9_speed_features.c +++ b/libvpx/vp9/encoder/vp9_speed_features.c @@ -157,6 +157,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, VP9_COMMON *cm, SPEED_FEATURES *sf, int speed) { + const VP9EncoderConfig *const oxcf = &cpi->oxcf; const int boosted = frame_is_boosted(cpi); int i; @@ -182,7 +183,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 1) { - if (cpi->oxcf.pass == 2) { + if (oxcf->pass == 2) { TWO_PASS *const twopass = &cpi->twopass; if ((twopass->fr_content_type == FC_GRAPHICS_ANIMATION) || vp9_internal_image_edge(cpi)) { @@ -225,12 +226,16 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 2) { - sf->recode_loop = ALLOW_RECODE_KFARFGF; + if (oxcf->vbr_corpus_complexity) + sf->recode_loop = ALLOW_RECODE_FIRST; + else + sf->recode_loop = ALLOW_RECODE_KFARFGF; + sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; // Reference masking is not supported in dynamic scaling mode. - sf->reference_masking = cpi->oxcf.resize_mode != RESIZE_DYNAMIC ? 1 : 0; + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 1 : 0; sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) @@ -240,7 +245,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->disable_filter_search_var_thresh = 100; sf->comp_inter_joint_search_thresh = BLOCK_SIZES; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->allow_partition_search_skip = 1; sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; @@ -271,6 +275,7 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->adaptive_interp_filter_search = 1; + sf->allow_partition_search_skip = 1; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -364,6 +369,11 @@ static void set_rt_speed_feature_framesize_independent( sf->copy_partition_flag = 0; sf->use_source_sad = 0; sf->use_simple_block_yrd = 0; + sf->adapt_partition_source_sad = 0; + sf->use_altref_onepass = 0; + sf->use_compound_nonrd_pickmode = 0; + sf->nonrd_keyframe = 0; + sf->svc_use_lowres_part = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -441,6 +451,8 @@ static void set_rt_speed_feature_framesize_independent( if (speed >= 4) { int i; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) + sf->use_altref_onepass = 1; sf->last_partitioning_redo_frequency = 4; sf->adaptive_rd_thresh = 5; sf->use_fast_coef_costing = 0; @@ -466,6 +478,7 @@ static void set_rt_speed_feature_framesize_independent( } if (speed >= 5) { + sf->use_altref_onepass = 0; sf->use_quant_fp = !is_keyframe; sf->auto_min_max_partition_size = is_keyframe ? 
RELAXED_NEIGHBORING_MIN_MAX : STRICT_NEIGHBORING_MIN_MAX;
@@ -521,12 +534,30 @@ static void set_rt_speed_feature_framesize_independent(
   }
 
   if (speed >= 6) {
+    if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0) {
+      sf->use_altref_onepass = 1;
+      sf->use_compound_nonrd_pickmode = 1;
+    }
     sf->partition_search_type = VAR_BASED_PARTITION;
     // Turn on this to use non-RD key frame coding mode.
     sf->use_nonrd_pick_mode = 1;
     sf->mv.search_method = NSTEP;
     sf->mv.reduce_first_step_size = 1;
     sf->skip_encode_sb = 0;
+
+    if (!cpi->external_resize) sf->use_source_sad = 1;
+
+    if (sf->use_source_sad) {
+      sf->adapt_partition_source_sad = 1;
+      sf->adapt_partition_thresh =
+          (cm->width * cm->height <= 640 * 360) ? 40000 : 60000;
+      if (cpi->content_state_sb_fd == NULL &&
+          (!cpi->use_svc ||
+           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
+        cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
+            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
+      }
+    }
     if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) {
       // Enable short circuit for low temporal variance.
       sf->short_circuit_low_temp_var = 1;
@@ -534,53 +565,64 @@ static void set_rt_speed_feature_framesize_independent(
     if (cpi->svc.temporal_layer_id > 0) {
       sf->adaptive_rd_thresh = 4;
       sf->limit_newmv_early_exit = 0;
-      sf->mv.subpel_force_stop = (cpi->svc.temporal_layer_id == 1) ? 1 : 2;
-      sf->base_mv_aggressive =
-          (cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1)
-              ? 1
-              : 0;
+      sf->base_mv_aggressive = 1;
     }
   }
 
   if (speed >= 7) {
+    sf->adapt_partition_source_sad = 0;
     sf->adaptive_rd_thresh = 3;
     sf->mv.search_method = FAST_DIAMOND;
     sf->mv.fullpel_search_step_param = 10;
+    // For SVC: use better mv search on base temporal layer, and only
+    // on base spatial layer if highest resolution is above 640x360.
    if (cpi->svc.number_temporal_layers > 2 &&
-        cpi->svc.temporal_layer_id == 0) {
+        cpi->svc.temporal_layer_id == 0 &&
+        (cpi->svc.spatial_layer_id == 0 ||
+         cpi->oxcf.width * cpi->oxcf.height <= 640 * 360)) {
       sf->mv.search_method = NSTEP;
       sf->mv.fullpel_search_step_param = 6;
     }
-    if (!cpi->external_resize) sf->use_source_sad = 1;
-    if (sf->use_source_sad) {
-      if (cpi->content_state_sb_fd == NULL &&
-          (!cpi->use_svc ||
-           cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
-        cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
-            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
-      }
+    if (cpi->svc.temporal_layer_id > 0 || cpi->svc.spatial_layer_id > 1) {
+      sf->use_simple_block_yrd = 1;
+      if (cpi->svc.non_reference_frame)
+        sf->mv.subpel_search_method = SUBPEL_TREE_PRUNED_EVENMORE;
     }
-  }
-
-  if (speed >= 8) {
-    sf->adaptive_rd_thresh = 4;
-    // Enable partition copy. For SVC, only enabled for top resolution layer,
+    if (cpi->use_svc && cpi->row_mt && cpi->oxcf.max_threads > 1)
+      sf->adaptive_rd_thresh_row_mt = 1;
+    // Enable partition copy. For SVC, only enabled for the top spatial
+    // resolution layer.
+    cpi->max_copied_frame = 0;
     if (!cpi->last_frame_dropped && cpi->resize_state == ORIG &&
         !cpi->external_resize &&
        (!cpi->use_svc ||
          cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) {
      sf->copy_partition_flag = 1;
-      cpi->max_copied_frame = 4;
+      cpi->max_copied_frame = 2;
+      // Frames in the top temporal enhancement layer (when the number of
+      // temporal layers is > 1) are non-reference frames, so use a large/max
+      // value for max_copied_frame.
+ if (cpi->svc.number_temporal_layers > 1 && + cpi->svc.temporal_layer_id == cpi->svc.number_temporal_layers - 1) + cpi->max_copied_frame = 255; } + // For SVC: enable use of lower resolution partition for higher resolution, + // only for 3 spatial layers and when config/top resolution is above VGA. + // Enable only for non-base temporal layer frames. + if (cpi->use_svc && cpi->svc.number_spatial_layers == 3 && + cpi->svc.temporal_layer_id > 0 && + cpi->oxcf.width * cpi->oxcf.height > 640 * 480) + sf->svc_use_lowres_part = 1; + } + if (speed >= 8) { + sf->adaptive_rd_thresh = 4; + sf->skip_encode_sb = 1; + sf->nonrd_keyframe = 1; + if (!cpi->use_svc) cpi->max_copied_frame = 4; if (cpi->row_mt && cpi->oxcf.max_threads > 1) sf->adaptive_rd_thresh_row_mt = 1; - if (content == VP9E_CONTENT_SCREEN) - sf->mv.subpel_force_stop = 3; - else if (cm->width * cm->height > 352 * 288) - sf->mv.subpel_force_stop = 2; - + if (content == VP9E_CONTENT_SCREEN) sf->mv.subpel_force_stop = 3; if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. if (!is_keyframe) { @@ -610,6 +652,20 @@ static void set_rt_speed_feature_framesize_independent( sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; } + if (sf->use_altref_onepass) { + if (cpi->rc.is_src_frame_alt_ref && cm->frame_type != KEY_FRAME) { + sf->partition_search_type = FIXED_PARTITION; + sf->always_this_block_size = BLOCK_64X64; + } + if (cpi->count_arf_frame_usage == NULL) + cpi->count_arf_frame_usage = + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_arf_frame_usage)); + if (cpi->count_lastgolden_frame_usage == NULL) + cpi->count_lastgolden_frame_usage = + (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), + sizeof(*cpi->count_lastgolden_frame_usage)); + } } void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { @@ -651,7 +707,8 @@ void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi) { // and multiple threads match. // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since // adaptive_rd_thresh is defined per-row for non-rd pickmode. - if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact) + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; // This is only used in motion vector unit test. @@ -768,7 +825,6 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { else if (oxcf->mode == GOOD) set_good_speed_feature_framesize_independent(cpi, cm, sf, oxcf->speed); - cpi->full_search_sad = vp9_full_search_sad; cpi->diamond_search_sad = vp9_diamond_search_sad; // Slow quant, dct and trellis not worthwhile for first pass @@ -808,7 +864,8 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { // and multiple threads match. // It can be used in realtime when adaptive_rd_thresh_row_mt is enabled since // adaptive_rd_thresh is defined per-row for non-rd pickmode. - if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact) + if (!sf->adaptive_rd_thresh_row_mt && cpi->row_mt_bit_exact && + oxcf->max_threads > 1) sf->adaptive_rd_thresh = 0; // This is only used in motion vector unit test. 
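The three per-superblock maps allocated in the hunks above (content_state_sb_fd, count_arf_frame_usage, and count_lastgolden_frame_usage) share one sizing rule: VP9 mode-info (mi) units cover 8x8 pixels, so shifting the mi dimensions right by 3 leaves one entry per 64x64 superblock. A minimal sketch of that sizing, with alloc_sb_map as a hypothetical stand-alone helper (not part of the patch):

    #include <stdlib.h>

    /* One byte of state per 64x64 superblock: a superblock spans 8 mi units
     * of 8x8 pixels each, hence the >> 3. The "+ 1" keeps a partial last row
     * of superblocks addressable when mi_rows is not a multiple of 8.
     * mi_stride/mi_rows mirror cm->mi_stride and cm->mi_rows. */
    static unsigned char *alloc_sb_map(int mi_stride, int mi_rows) {
      const size_t sb_cols = (size_t)(mi_stride >> 3);
      const size_t sb_rows = (size_t)(mi_rows >> 3) + 1;
      return (unsigned char *)calloc(sb_cols * sb_rows, sizeof(unsigned char));
    }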
diff --git a/libvpx/vp9/encoder/vp9_speed_features.h b/libvpx/vp9/encoder/vp9_speed_features.h index ee485a35f..50d52bc23 100644 --- a/libvpx/vp9/encoder/vp9_speed_features.h +++ b/libvpx/vp9/encoder/vp9_speed_features.h @@ -490,6 +490,24 @@ typedef struct SPEED_FEATURES { int use_source_sad; int use_simple_block_yrd; + + // If source sad of superblock is high (> adapt_partition_thresh), will switch + // from VARIANCE_PARTITION to REFERENCE_PARTITION (which selects partition + // based on the nonrd-pickmode). + int adapt_partition_source_sad; + int adapt_partition_thresh; + + // Enable use of alt-refs in 1 pass VBR. + int use_altref_onepass; + + // Enable use of compound prediction, for nonrd_pickmode with nonzero lag. + int use_compound_nonrd_pickmode; + + // Always use nonrd_pick_intra for all block sizes on keyframes. + int nonrd_keyframe; + + // For SVC: enables use of partition from lower spatial resolution. + int svc_use_lowres_part; } SPEED_FEATURES; struct VP9_COMP; diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c index 5867a6c38..2636bd9a5 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -36,6 +36,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->scaled_temp_is_alloc = 0; svc->scaled_one_half = 0; svc->current_superframe = 0; + svc->non_reference_frame = 0; + for (i = 0; i < REF_FRAMES; ++i) svc->ref_frame_index[i] = -1; for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { svc->ext_frame_flags[sl] = 0; @@ -173,7 +175,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / spatial_layer_target; + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); lrc->optimal_buffer_level = @@ -351,6 +353,7 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { } } +#if !CONFIG_REALTIME_ONLY void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; @@ -366,6 +369,7 @@ void vp9_init_second_pass_spatial_svc(VP9_COMP *cpi) { } svc->spatial_layer_id = 0; } +#endif // !CONFIG_REALTIME_ONLY void vp9_inc_frame_in_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *const lc = @@ -386,9 +390,9 @@ int vp9_is_upper_layer_key_frame(const VP9_COMP *const cpi) { .is_key_frame; } -static void get_layer_resolution(const int width_org, const int height_org, - const int num, const int den, int *width_out, - int *height_out) { +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out) { int w, h; if (width_out == NULL || height_out == NULL || den == 0) return; @@ -603,6 +607,7 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { LAYER_CONTEXT *lc = NULL; if (cpi->svc.number_spatial_layers > 1) cpi->svc.use_base_mv = 1; cpi->svc.force_zero_mode_spatial_ref = 1; + cpi->svc.mi_stride[cpi->svc.spatial_layer_id] = cpi->common.mi_stride; if (cpi->svc.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212) { set_flags_and_fb_idx_for_temporal_mode3(cpi); @@ -652,9 +657,9 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); - // For low resolutions: set phase of the filter = 8 (for symmetric averaging - // filter), use bilinear for now. 
- if (width <= 320 && height <= 240) { + // For resolutions <= VGA: set phase of the filter = 8 (for symmetric + // averaging filter), use bilinear for now. + if (width * height <= 640 * 480) { cpi->svc.downsample_filter_type[cpi->svc.spatial_layer_id] = BILINEAR; cpi->svc.downsample_filter_phase[cpi->svc.spatial_layer_id] = 8; } @@ -677,6 +682,12 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { } } + cpi->svc.non_reference_frame = 0; + if (cpi->common.frame_type != KEY_FRAME && !cpi->ext_refresh_last_frame && + !cpi->ext_refresh_golden_frame && !cpi->ext_refresh_alt_ref_frame) { + cpi->svc.non_reference_frame = 1; + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; @@ -851,3 +862,28 @@ void vp9_svc_reset_key_frame(VP9_COMP *const cpi) { vp9_update_temporal_layer_framerate(cpi); vp9_restore_layer_context(cpi); } + +void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { + SVC *svc = &cpi->svc; + int sl, tl; + for (sl = 0; sl < svc->number_spatial_layers; ++sl) { + // Check for reset based on avg_frame_bandwidth for spatial layer sl. + int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || + lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { + // Reset for all temporal layers with spatial layer sl. + for (tl = 0; tl < svc->number_temporal_layers; ++tl) { + int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + RATE_CONTROL *lrc = &lc->rc; + lrc->rc_1_frame = 0; + lrc->rc_2_frame = 0; + lrc->bits_off_target = lrc->optimal_buffer_level; + lrc->buffer_level = lrc->optimal_buffer_level; + } + } + } +} diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h index d8e6772b2..b7cdfd962 100644 --- a/libvpx/vp9/encoder/vp9_svc_layercontext.h +++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h @@ -49,7 +49,7 @@ typedef struct { uint8_t speed; } LAYER_CONTEXT; -typedef struct { +typedef struct SVC { int spatial_layer_id; int temporal_layer_id; int number_spatial_layers; @@ -87,6 +87,7 @@ typedef struct { int ref_frame_index[REF_FRAMES]; int force_zero_mode_spatial_ref; int current_superframe; + int non_reference_frame; int use_base_mv; // Used to control the downscaling filter for source scaling, for 1 pass CBR. // downsample_filter_phase: = 0 will do sub-sampling (no weighted average), @@ -95,6 +96,11 @@ typedef struct { // eighttap_smooth, eighttap_sharp, and bilinear. INTERP_FILTER downsample_filter_type[VPX_SS_MAX_LAYERS]; int downsample_filter_phase[VPX_SS_MAX_LAYERS]; + + BLOCK_SIZE *prev_partition_svc; + int mi_stride[VPX_MAX_LAYERS]; + + int first_layer_denoise; } SVC; struct VP9_COMP; @@ -124,6 +130,10 @@ void vp9_save_layer_context(struct VP9_COMP *const cpi); // Initialize second pass rc for spatial svc. 
void vp9_init_second_pass_spatial_svc(struct VP9_COMP *cpi); +void get_layer_resolution(const int width_org, const int height_org, + const int num, const int den, int *width_out, + int *height_out); + // Increment number of video frames in layer void vp9_inc_frame_in_layer(struct VP9_COMP *const cpi); @@ -144,6 +154,8 @@ void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); +void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c index 630794156..2758c42ae 100644 --- a/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -350,6 +350,27 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td, td->mb.mv_limits.col_max = ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); + if (cpi->oxcf.content == VP9E_CONTENT_FILM) { + unsigned int src_variance; + struct buf_2d src; + + src.buf = f->y_buffer + mb_y_offset; + src.stride = f->y_stride; + +#if CONFIG_VP9_HIGHBITDEPTH + if (mbd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + src_variance = + vp9_high_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16, mbd->bd); + } else { + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); + } +#else + src_variance = vp9_get_sby_perpixel_variance(cpi, &src, BLOCK_16X16); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (src_variance <= 2) strength = VPXMAX(0, (int)strength - 2); + } + for (frame = 0; frame < frame_count; frame++) { const uint32_t thresh_low = 10000; const uint32_t thresh_high = 20000; diff --git a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c index be4cd8685..460dab659 100644 --- a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c +++ b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c @@ -11,6 +11,7 @@ #include <assert.h> #include <smmintrin.h> +#include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vpx/vpx_integer.h" diff --git a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 09a1e48fc..dbd243ac1 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/x86/fwd_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" #include "vpx_ports/mem.h" @@ -71,7 +72,7 @@ static INLINE void transpose_4x4(__m128i *res) { } static void fdct4_sse2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); @@ -193,7 +194,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -706,61 +707,9 @@ static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res, store_output(&res[7], (output + 7 * stride)); } -// perform in-place transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - static void fdct8_sse2(__m128i *in) { // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -895,7 +844,7 @@ static void fdct8_sse2(__m128i *in) { in[7] = _mm_packs_epi32(v6, v7); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } static void fadst8_sse2(__m128i *in) { @@ -912,7 +861,7 @@ static void fadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = 
_mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__const_0 = _mm_set1_epi16(0); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -1125,7 +1074,7 @@ static void fadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); } void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride, @@ -1182,23 +1131,6 @@ static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0, write_buffer_8x8(output + 8 * stride, in1 + 8, stride); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { // perform rounding operations right_shift_8x8(res0, 2); @@ -1210,7 +1142,7 @@ static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) { static void fdct16_8col(__m128i *in) { // perform 16x16 1-D DCT for 8 columns __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); @@ -1557,8 +1489,8 @@ static void fadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -2002,13 +1934,13 @@ static void fadst16_8col(__m128i *in) { static void fdct16_sse2(__m128i *in0, __m128i *in1) { fdct16_8col(in0); fdct16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } static void fadst16_sse2(__m128i *in0, __m128i *in1) { fadst16_8col(in0); fadst16_8col(in1); - array_transpose_16x16(in0, in1); + transpose_16bit_16x16(in0, in1); } void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, diff --git a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c index db57ee1f1..bf874a09e 100644 --- a/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c @@ -31,7 +31,7 @@ void vp9_fdct8x8_quant_ssse3( // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); diff --git a/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c b/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c index 91d0602f9..5930bf491 100644 --- a/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_denoiser_sse2.c @@ -13,7 +13,6 @@ #include "./vpx_config.h" #include "./vp9_rtcd.h" -#include "vpx_ports/emmintrin_compat.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_reconinter.h" #include "vp9/encoder/vp9_context_tree.h" diff --git a/libvpx/vp9/encoder/x86/vp9_error_avx2.c b/libvpx/vp9/encoder/x86/vp9_error_avx2.c index e228bd8b7..99fef31d1 100644 --- a/libvpx/vp9/encoder/x86/vp9_error_avx2.c +++ b/libvpx/vp9/encoder/x86/vp9_error_avx2.c @@ -1,7 +1,7 @@ /* * Copyright (c) 2014 The WebM project authors. All Rights Reserved. * - * Usee of this source code is governed by a BSD-style license + * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source * tree. An additional intellectual property rights grant can be found * in the file PATENTS. All contributing project authors may @@ -105,3 +105,57 @@ int64_t vp9_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, _mm_storel_epi64((__m128i *)(ssz), ssz_128); return sse; } + +int64_t vp9_block_error_fp_avx2(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + int i; + const __m256i zero = _mm256_setzero_si256(); + __m256i sse_256 = zero; + __m256i sse_hi; + __m128i sse_128; + int64_t sse; + + if (block_size == 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + // dqcoeff - coeff + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + // madd (dqcoeff - coeff) + const __m256i error_lo = _mm256_madd_epi16(diff, diff); + // Save the higher 64 bit of each 128 bit lane. + const __m256i error_hi = _mm256_srli_si256(error_lo, 8); + // Add the higher 64 bit to the low 64 bit. + const __m256i error = _mm256_add_epi32(error_lo, error_hi); + // Expand each double word in the lower 64 bits to quad word. + sse_256 = _mm256_unpacklo_epi32(error, zero); + } else { + for (i = 0; i < block_size; i += 16) { + // Load 16 elements for coeff and dqcoeff. + const __m256i _coeff = load_tran_low(coeff); + const __m256i _dqcoeff = load_tran_low(dqcoeff); + const __m256i diff = _mm256_sub_epi16(_dqcoeff, _coeff); + const __m256i error = _mm256_madd_epi16(diff, diff); + // Expand each double word of madd (dqcoeff - coeff) to quad word. + const __m256i exp_error_lo = _mm256_unpacklo_epi32(error, zero); + const __m256i exp_error_hi = _mm256_unpackhi_epi32(error, zero); + // Add each quad word of madd (dqcoeff - coeff). + sse_256 = _mm256_add_epi64(sse_256, exp_error_lo); + sse_256 = _mm256_add_epi64(sse_256, exp_error_hi); + coeff += 16; + dqcoeff += 16; + } + } + // Save the higher 64 bit of each 128 bit lane. + sse_hi = _mm256_srli_si256(sse_256, 8); + // Add the higher 64 bit to the low 64 bit. + sse_256 = _mm256_add_epi64(sse_256, sse_hi); + + // Add each 64 bit from each of the 128 bit lane of the 256 bit. 
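+  // (_mm256_castsi256_si128() reads the low 128-bit lane and
+  // _mm256_extractf128_si256(sse_256, 1) the high lane, so this single add
+  // leaves the total in the low 64 bits of sse_128.)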
+ sse_128 = _mm_add_epi64(_mm256_castsi256_si128(sse_256), + _mm256_extractf128_si256(sse_256, 1)); + + // Store the results. + _mm_storel_epi64((__m128i *)&sse, sse_128); + return sse; +} diff --git a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c index b53714a02..7685e7bc3 100644 --- a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -13,159 +13,738 @@ #include "./vp9_rtcd.h" #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_scale/yv12config.h" -extern void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - uint8_t filter_type, int phase_scaler); +static INLINE __m128i scale_plane_2_to_1_phase_0_kernel( + const uint8_t *const src, const __m128i *const mask) { + const __m128i a = _mm_loadu_si128((const __m128i *)(&src[0])); + const __m128i b = _mm_loadu_si128((const __m128i *)(&src[16])); + const __m128i a_and = _mm_and_si128(a, *mask); + const __m128i b_and = _mm_and_si128(b, *mask); + return _mm_packus_epi16(a_and, b_and); +} -static void downsample_2_to_1_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int w, - int h) { +static void scale_plane_2_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; const __m128i mask = _mm_set1_epi16(0x00FF); - const int max_width = w & ~15; - int y; - for (y = 0; y < h; ++y) { - int x; - for (x = 0; x < max_width; x += 16) { - const __m128i a = _mm_loadu_si128((const __m128i *)(src + x * 2 + 0)); - const __m128i b = _mm_loadu_si128((const __m128i *)(src + x * 2 + 16)); - const __m128i a_and = _mm_and_si128(a, mask); - const __m128i b_and = _mm_and_si128(b, mask); - const __m128i c = _mm_packus_epi16(a_and, b_and); - _mm_storeu_si128((__m128i *)(dst + x), c); - } - for (; x < w; ++x) dst[x] = src[x * 2]; - src += src_stride * 2; - dst += dst_stride; - } + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d = scale_plane_2_to_1_phase_0_kernel(src, &mask); + _mm_storeu_si128((__m128i *)dst, d); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); } -static INLINE __m128i filter(const __m128i *const a, const __m128i *const b, - const __m128i *const c, const __m128i *const d, - const __m128i *const e, const __m128i *const f, - const __m128i *const g, const __m128i *const h) { - const __m128i coeffs_ab = - _mm_set_epi8(6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1, 6, -1); - const __m128i coeffs_cd = _mm_set_epi8(78, -19, 78, -19, 78, -19, 78, -19, 78, - -19, 78, -19, 78, -19, 78, -19); - const __m128i const64_x16 = _mm_set1_epi16(64); - const __m128i ab = _mm_unpacklo_epi8(*a, *b); - const __m128i cd = _mm_unpacklo_epi8(*c, *d); - const __m128i fe = _mm_unpacklo_epi8(*f, *e); - const __m128i hg = _mm_unpacklo_epi8(*h, *g); - const __m128i ab_terms = _mm_maddubs_epi16(ab, coeffs_ab); - const __m128i cd_terms = _mm_maddubs_epi16(cd, coeffs_cd); - const __m128i fe_terms = _mm_maddubs_epi16(fe, coeffs_cd); - const __m128i hg_terms = _mm_maddubs_epi16(hg, coeffs_ab); - // can not overflow - const __m128i abcd_terms = _mm_add_epi16(ab_terms, cd_terms); - // can not overflow - const __m128i fehg_terms = 
_mm_add_epi16(fe_terms, hg_terms); - // can overflow, use saturating add - const __m128i terms = _mm_adds_epi16(abcd_terms, fehg_terms); - const __m128i round = _mm_adds_epi16(terms, const64_x16); - const __m128i shift = _mm_srai_epi16(round, 7); - return _mm_packus_epi16(shift, shift); +static void scale_plane_4_to_1_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h) { + const int max_width = (dst_w + 15) & ~15; + const __m128i mask = _mm_set1_epi32(0x000000FF); + int y = dst_h; + + do { + int x = max_width; + do { + const __m128i d0 = scale_plane_2_to_1_phase_0_kernel(&src[0], &mask); + const __m128i d1 = scale_plane_2_to_1_phase_0_kernel(&src[32], &mask); + const __m128i d2 = _mm_packus_epi16(d0, d1); + _mm_storeu_si128((__m128i *)dst, d2); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); } -static void eight_tap_row_ssse3(const uint8_t *src, uint8_t *dst, int w) { - const int max_width = w & ~7; - int x = 0; - for (; x < max_width; x += 8) { - const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x + 0)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(src + x + 1)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(src + x + 2)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(src + x + 3)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(src + x + 4)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(src + x + 5)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(src + x + 6)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(src + x + 7)); - const __m128i pack = filter(&a, &b, &c, &d, &e, &f, &g, &h); - _mm_storel_epi64((__m128i *)(dst + x), pack); - } +static INLINE __m128i scale_plane_bilinear_kernel(const __m128i *const s, + const __m128i c0c1) { + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i t0 = _mm_maddubs_epi16(s[0], c0c1); + const __m128i t1 = _mm_maddubs_epi16(s[1], c0c1); + // round and shift by 7 bit each 16 bit + const __m128i t2 = _mm_adds_epi16(t0, k_64); + const __m128i t3 = _mm_adds_epi16(t1, k_64); + const __m128i t4 = _mm_srai_epi16(t2, 7); + const __m128i t5 = _mm_srai_epi16(t3, 7); + return _mm_packus_epi16(t4, t5); } -static void upsample_1_to_2_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, int dst_w, - int dst_h) { - dst_w /= 2; - dst_h /= 2; - { - DECLARE_ALIGNED(16, uint8_t, tmp[1920 * 8]); - uint8_t *tmp0 = tmp + dst_w * 0; - uint8_t *tmp1 = tmp + dst_w * 1; - uint8_t *tmp2 = tmp + dst_w * 2; - uint8_t *tmp3 = tmp + dst_w * 3; - uint8_t *tmp4 = tmp + dst_w * 4; - uint8_t *tmp5 = tmp + dst_w * 5; - uint8_t *tmp6 = tmp + dst_w * 6; - uint8_t *tmp7 = tmp + dst_w * 7; - uint8_t *tmp8 = NULL; - const int max_width = dst_w & ~7; - int y; - eight_tap_row_ssse3(src - src_stride * 3 - 3, tmp0, dst_w); - eight_tap_row_ssse3(src - src_stride * 2 - 3, tmp1, dst_w); - eight_tap_row_ssse3(src - src_stride * 1 - 3, tmp2, dst_w); - eight_tap_row_ssse3(src + src_stride * 0 - 3, tmp3, dst_w); - eight_tap_row_ssse3(src + src_stride * 1 - 3, tmp4, dst_w); - eight_tap_row_ssse3(src + src_stride * 2 - 3, tmp5, dst_w); - eight_tap_row_ssse3(src + src_stride * 3 - 3, tmp6, dst_w); - for (y = 0; y < dst_h; y++) { - int x; - eight_tap_row_ssse3(src + src_stride * 4 - 3, tmp7, dst_w); - for (x = 0; x < max_width; x += 8) { - const __m128i A = _mm_loadl_epi64((const __m128i *)(src + x)); - const __m128i B = 
_mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i AB = _mm_unpacklo_epi8(A, B); - __m128i C, D, CD; - _mm_storeu_si128((__m128i *)(dst + x * 2), AB); - { - const __m128i a = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 3)); - const __m128i b = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 2)); - const __m128i c = - _mm_loadl_epi64((const __m128i *)(src + x - src_stride * 1)); - const __m128i d = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 0)); - const __m128i e = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 1)); - const __m128i f = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 2)); - const __m128i g = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 3)); - const __m128i h = - _mm_loadl_epi64((const __m128i *)(src + x + src_stride * 4)); - C = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - { - const __m128i a = _mm_loadl_epi64((const __m128i *)(tmp0 + x)); - const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp1 + x)); - const __m128i c = _mm_loadl_epi64((const __m128i *)(tmp2 + x)); - const __m128i d = _mm_loadl_epi64((const __m128i *)(tmp3 + x)); - const __m128i e = _mm_loadl_epi64((const __m128i *)(tmp4 + x)); - const __m128i f = _mm_loadl_epi64((const __m128i *)(tmp5 + x)); - const __m128i g = _mm_loadl_epi64((const __m128i *)(tmp6 + x)); - const __m128i h = _mm_loadl_epi64((const __m128i *)(tmp7 + x)); - D = filter(&a, &b, &c, &d, &e, &f, &g, &h); - } - CD = _mm_unpacklo_epi8(C, D); - _mm_storeu_si128((__m128i *)(dst + x * 2 + dst_stride), CD); - } - src += src_stride; - dst += dst_stride * 2; - tmp8 = tmp0; - tmp0 = tmp1; - tmp1 = tmp2; - tmp2 = tmp3; - tmp3 = tmp4; - tmp4 = tmp5; - tmp5 = tmp6; - tmp6 = tmp7; - tmp7 = tmp8; +static void scale_plane_2_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[2], d[2]; + + // Horizontal + // Even rows + s[0] = _mm_loadu_si128((const __m128i *)(src + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + 16)); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + // odd rows + s[0] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[1] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + d[1] = scale_plane_bilinear_kernel(s, c0c1); + + // Vertical + s[0] = _mm_unpacklo_epi8(d[0], d[1]); + s[1] = _mm_unpackhi_epi8(d[0], d[1]); + d[0] = scale_plane_bilinear_kernel(s, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 32; + dst += 16; + x -= 16; + } while (x); + src += 2 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_4_to_1_bilinear(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *dst, + const ptrdiff_t dst_stride, + const int dst_w, const int dst_h, + const __m128i c0c1) { + const int max_width = (dst_w + 15) & ~15; + int y = dst_h; + + do { + int x = max_width; + do { + __m128i s[8], d[8]; + + // Note: Using _mm_packus_epi32() in SSE4.1 could be faster. + // Here we tried to not use shuffle instructions which would be slow + // on some x86 CPUs. 
+ + // Horizontal + // 000 001 xx xx 004 005 xx xx 008 009 xx xx 00C 00D xx xx + // 010 011 xx xx 014 015 xx xx 018 019 xx xx 01C 01D xx xx + // 020 021 xx xx 024 025 xx xx 028 029 xx xx 02C 02D xx xx + // 030 031 xx xx 034 035 xx xx 038 039 xx xx 03C 03D xx xx + // 100 101 xx xx 104 105 xx xx 108 109 xx xx 10C 10D xx xx + // 110 111 xx xx 114 115 xx xx 118 119 xx xx 11C 11D xx xx + // 120 121 xx xx 124 125 xx xx 128 129 xx xx 12C 12D xx xx + // 130 131 xx xx 134 135 xx xx 138 139 xx xx 13C 13D xx xx + s[0] = _mm_loadu_si128((const __m128i *)(&src[0])); + s[1] = _mm_loadu_si128((const __m128i *)(&src[16])); + s[2] = _mm_loadu_si128((const __m128i *)(&src[32])); + s[3] = _mm_loadu_si128((const __m128i *)(&src[48])); + s[4] = _mm_loadu_si128((const __m128i *)(src + src_stride + 0)); + s[5] = _mm_loadu_si128((const __m128i *)(src + src_stride + 16)); + s[6] = _mm_loadu_si128((const __m128i *)(src + src_stride + 32)); + s[7] = _mm_loadu_si128((const __m128i *)(src + src_stride + 48)); + + // 000 001 100 101 xx xx xx xx 004 005 104 105 xx xx xx xx + // 008 009 108 109 xx xx xx xx 00C 00D 10C 10D xx xx xx xx + // 010 011 110 111 xx xx xx xx 014 015 114 115 xx xx xx xx + // 018 019 118 119 xx xx xx xx 01C 01D 11C 11D xx xx xx xx + // 020 021 120 121 xx xx xx xx 024 025 124 125 xx xx xx xx + // 028 029 128 129 xx xx xx xx 02C 02D 12C 12D xx xx xx xx + // 030 031 130 131 xx xx xx xx 034 035 134 135 xx xx xx xx + // 038 039 138 139 xx xx xx xx 03C 03D 13C 13D xx xx xx xx + d[0] = _mm_unpacklo_epi16(s[0], s[4]); + d[1] = _mm_unpackhi_epi16(s[0], s[4]); + d[2] = _mm_unpacklo_epi16(s[1], s[5]); + d[3] = _mm_unpackhi_epi16(s[1], s[5]); + d[4] = _mm_unpacklo_epi16(s[2], s[6]); + d[5] = _mm_unpackhi_epi16(s[2], s[6]); + d[6] = _mm_unpacklo_epi16(s[3], s[7]); + d[7] = _mm_unpackhi_epi16(s[3], s[7]); + + // 000 001 100 101 008 009 108 109 xx xx xx xx xx xx xx xx + // 004 005 104 105 00C 00D 10C 10D xx xx xx xx xx xx xx xx + // 010 011 110 111 018 019 118 119 xx xx xx xx xx xx xx xx + // 014 015 114 115 01C 01D 11C 11D xx xx xx xx xx xx xx xx + // 020 021 120 121 028 029 128 129 xx xx xx xx xx xx xx xx + // 024 025 124 125 02C 02D 12C 12D xx xx xx xx xx xx xx xx + // 030 031 130 131 038 039 138 139 xx xx xx xx xx xx xx xx + // 034 035 134 135 03C 03D 13C 13D xx xx xx xx xx xx xx xx + s[0] = _mm_unpacklo_epi32(d[0], d[1]); + s[1] = _mm_unpackhi_epi32(d[0], d[1]); + s[2] = _mm_unpacklo_epi32(d[2], d[3]); + s[3] = _mm_unpackhi_epi32(d[2], d[3]); + s[4] = _mm_unpacklo_epi32(d[4], d[5]); + s[5] = _mm_unpackhi_epi32(d[4], d[5]); + s[6] = _mm_unpacklo_epi32(d[6], d[7]); + s[7] = _mm_unpackhi_epi32(d[6], d[7]); + + // 000 001 100 101 004 005 104 105 008 009 108 109 00C 00D 10C 10D + // 010 011 110 111 014 015 114 115 018 019 118 119 01C 01D 11C 11D + // 020 021 120 121 024 025 124 125 028 029 128 129 02C 02D 12C 12D + // 030 031 130 131 034 035 134 135 038 039 138 139 03C 03D 13C 13D + d[0] = _mm_unpacklo_epi32(s[0], s[1]); + d[1] = _mm_unpacklo_epi32(s[2], s[3]); + d[2] = _mm_unpacklo_epi32(s[4], s[5]); + d[3] = _mm_unpacklo_epi32(s[6], s[7]); + + d[0] = scale_plane_bilinear_kernel(&d[0], c0c1); + d[1] = scale_plane_bilinear_kernel(&d[2], c0c1); + + // Vertical + d[0] = scale_plane_bilinear_kernel(d, c0c1); + + _mm_storeu_si128((__m128i *)dst, d[0]); + src += 64; + dst += 16; + x -= 16; + } while (x); + src += 4 * (src_stride - max_width); + dst += dst_stride - max_width; + } while (--y); +} + +static void scale_plane_2_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + 
const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 3) & ~3; + const int width_ver = (w + 7) & ~7; + const int height_hor = (2 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 3) & ~3; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 1; + + // horizontal 4x8 + do { + load_8bit_8x8(src + 2, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[3]); + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + transpose_16bit_4x8(&s[3], &s[3]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[1], f); // 01 11 21 31 41 51 61 71 + d[2] = convolve8_8_ssse3(&s[2], f); // 02 12 22 32 42 52 62 72 + d[3] = convolve8_8_ssse3(&s[3], f); // 03 13 23 33 43 53 63 73 + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + d[0] = _mm_packus_epi16(d[0], d[2]); + d[1] = _mm_packus_epi16(d[1], d[3]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + d[2] = _mm_unpacklo_epi16(d[0], d[1]); + d[3] = _mm_unpackhi_epi16(d[0], d[1]); + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + d[0] = _mm_unpacklo_epi32(d[2], d[3]); + d[1] = _mm_unpackhi_epi32(d[2], d[3]); + store_8bit_8x4_from_16x2(d, t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + t += 8; + x -= 4; + } while (x); + src += 8 * src_stride - 2 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x4 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + s[2] = _mm_loadu_si128((const __m128i *)(t + 4 * width_hor)); + t += 6 * width_hor; + y = height_ver; + + do { + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[3]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[1], f); // 10 11 12 13 14 15 16 17 + d[2] = convolve8_8_ssse3(&s[2], f); // 20 21 22 23 24 25 26 27 + d[3] = convolve8_8_ssse3(&s[3], f); // 30 31 32 33 34 35 36 37 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[1] = _mm_packus_epi16(d[2], d[3]); + store_8bit_8x4_from_16x2(d, dst, dst_stride); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + + dst += 4 * dst_stride; + y -= 4; + } while (y); + t -= width_hor * (2 * height_ver + 6); + t 
+= 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + const int width_hor = (w + 1) & ~1; + const int width_ver = (w + 7) & ~7; + const int height_hor = (4 * h + SUBPEL_TAPS - 2 + 7) & ~7; + const int height_ver = (h + 1) & ~1; + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[11], d[4]; + __m128i f[4]; + + assert(w && h); + + shuffle_filter_ssse3(coef, f); + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 + 3; + + // horizontal 2x8 + do { + load_8bit_8x8(src + 4, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 (overlapped) + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 (overlapped) + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[2]); + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + transpose_16bit_4x8(&s[2], &s[2]); + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 10 20 30 40 50 60 70 + d[1] = convolve8_8_ssse3(&s[2], f); // 01 11 21 31 41 51 61 71 + + // 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx + // 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx + d[0] = _mm_packus_epi16(d[0], d[0]); + d[1] = _mm_packus_epi16(d[1], d[1]); + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + d[0] = _mm_unpacklo_epi16(d[0], d[1]); + store_8bit_4x4_sse2(d[0], t, 2 * width_hor); + + s[0] = s[4]; + s[1] = s[5]; + + t += 4; + x -= 2; + } while (x); + src += 8 * src_stride - 4 * width_hor; + t += 6 * width_hor; + y -= 8; + } while (y); + + // vertical 8x2 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + s[0] = _mm_loadu_si128((const __m128i *)(t + 0 * width_hor)); + s[1] = _mm_loadu_si128((const __m128i *)(t + 2 * width_hor)); + t += 4 * width_hor; + y = height_ver; + + do { + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 77 + // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 77 + loadu_8bit_16x4(t, 2 * width_hor, &s[2]); + t += 8 * width_hor; + + d[0] = convolve8_8_ssse3(&s[0], f); // 00 01 02 03 04 05 06 07 + d[1] = convolve8_8_ssse3(&s[2], f); // 10 11 12 13 14 15 16 17 + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + d[0] = _mm_packus_epi16(d[0], d[1]); + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + + s[0] = s[4]; + s[1] = s[5]; + + dst += 2 * dst_stride; + y -= 2; + } while (y); + t -= width_hor * (4 * height_ver + 4); + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +typedef void (*shuffle_filter_funcs)(const int16_t *const filter, + __m128i *const f); + +typedef __m128i (*convolve8_funcs)(const __m128i *const s, + const __m128i *const f); + +static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, + uint8_t *dst, const int dst_stride, + const int w, const int h, + const InterpKernel *const coef, + const int phase_scaler, + uint8_t *const 
temp_buffer) { + static const int step_q4 = 16 * 4 / 3; + const int width_hor = (w + 5) - ((w + 5) % 6); + const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels + const int width_ver = (w + 7) & ~7; + // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows + // above and (SUBPEL_TAPS / 2) extra rows below. + const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + const int height_ver = (h + 5) - ((h + 5) % 6); + int x, y = height_hor; + uint8_t *t = temp_buffer; + __m128i s[12], d[6], dd[4]; + __m128i f0[4], f1[5], f2[5]; + // The offset of the first row is always less than 1 pixel. + const int offset1_q4 = phase_scaler + 1 * step_q4; + const int offset2_q4 = phase_scaler + 2 * step_q4; + // offset_idx1 and offset_idx2 indicate whether the pixel offset is even (0) + // or odd (1). They are used to choose the src offset and the filter + // coefficient offset. + const int offset_idx1 = (offset1_q4 >> 4) & 1; + const int offset_idx2 = (offset2_q4 >> 4) & 1; + static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + shuffle_filter_ssse3, shuffle_filter_odd_ssse3 + }; + static const convolve8_funcs convolve8_funcs[2] = { + convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 + }; + + assert(w && h); + + shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); + shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + + // Subtract 64 to avoid overflow. + // A coefficient of 128 would be treated as -128 in PMADDUBSW, so subtract + // 64 from it. Coefficient 128 is in either fx[1] or fx[2], depending on the + // phase index. When the filter phase index is 1, the two biggest + // coefficients are shuffled together and their sum is always no less than + // 128, so subtract 64 there as well. After the subtraction, the sum of all + // positive coefficients is no larger than 128 and the sum of all negative + // coefficients is no less than -128, so there is no overflow in the + // convolve8 functions.
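To make the comment above concrete: PMADDUBSW multiplies unsigned 8-bit pixels by signed 8-bit coefficients, so a tap of 128 (the largest value a VP9 InterpKernel tap can take, since the eight taps sum to 128) does not fit in a signed byte, and two large taps shuffled into one pair can push the saturating 16-bit pair sums out of range. Subtracting 64 keeps every intermediate in range, and because convolution is linear the missing 64 * pixel term can be restored afterwards. Below is a minimal scalar sketch of that identity; the demo_* names are hypothetical, this is not the SIMD implementation, and how the real convolve8_8_*_offset_ssse3 helpers restore the term is outside this hunk.

#include <stdint.h>

#define SUBPEL_TAPS 8

/* Reference 8-tap convolution: taps sum to 128, so round with +64, shift 7. */
static int demo_convolve8(const uint8_t *src, const int16_t *taps) {
  int k, sum = 0;
  for (k = 0; k < SUBPEL_TAPS; ++k) sum += taps[k] * src[k];
  return (sum + 64) >> 7;
}

/* Same result with the oversized tap reduced by 64 so that it fits in a
 * signed byte; the missing 64 * src[big] contribution is added back. */
static int demo_convolve8_sub64(const uint8_t *src, const int16_t *taps,
                                int big) {
  int k, sum = 64 * src[big]; /* restore the subtracted contribution */
  for (k = 0; k < SUBPEL_TAPS; ++k)
    sum += (k == big ? taps[k] - 64 : taps[k]) * src[k];
  return (sum + 64) >> 7; /* bit-identical to demo_convolve8() */
}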
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64)); + f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64)); + f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64)); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1; + + // horizontal 6x8 + do { + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, s); + x = width_hor; + + do { + src += 8; + load_8bit_8x8(src, src_stride, &s[4]); + // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79 + // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B + // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D + // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F + transpose_16bit_4x8(&s[4], &s[4]); + + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 + // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx + // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx + dd[0] = _mm_packus_epi16(d[0], d[2]); + dd[1] = _mm_packus_epi16(d[1], d[3]); + dd[2] = _mm_packus_epi16(d[4], d[4]); + dd[3] = _mm_packus_epi16(d[5], d[5]); + + // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71 + // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73 + // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75 + d[0] = _mm_unpacklo_epi16(dd[0], dd[1]); + d[1] = _mm_unpackhi_epi16(dd[0], dd[1]); + d[2] = _mm_unpacklo_epi16(dd[2], dd[3]); + + // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33 + // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73 + // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx + // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx + dd[0] = _mm_unpacklo_epi32(d[0], d[1]); + dd[1] = _mm_unpackhi_epi32(d[0], d[1]); + dd[2] = _mm_unpacklo_epi32(d[2], d[2]); + dd[3] = _mm_unpackhi_epi32(d[2], d[2]); + + // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx + // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx + // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx + // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx + d[0] = _mm_unpacklo_epi64(dd[0], dd[2]); + d[1] = _mm_unpackhi_epi64(dd[0], dd[2]); + d[2] = _mm_unpacklo_epi64(dd[1], dd[3]); + d[3] = _mm_unpackhi_epi64(dd[1], dd[3]); + + // store 4 extra pixels + storeu_8bit_16x4(d, t, stride_hor); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + t += 12; + x -= 6; + } while (x); + src += 8 * src_stride - 4 * width_hor / 3; + t += 3 * stride_hor + 4; + y -= 8; + } while (y); + + // vertical 8x6 + x = width_ver; + t = temp_buffer; + do { + // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + loadu_8bit_16x4(t, stride_hor, s); + y = height_ver; + + do { + // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97 + // A0 B0
A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7 + // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7 + // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7 + t += 4 * stride_hor; + loadu_8bit_16x4(t, stride_hor, &s[4]); + + d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); + d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); + d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + + // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57 + d[0] = _mm_packus_epi16(d[0], d[1]); + d[2] = _mm_packus_epi16(d[2], d[3]); + d[4] = _mm_packus_epi16(d[4], d[5]); + + _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]); + _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]); + _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]); + _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]); + _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]); + _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]); + + s[0] = s[4]; + s[1] = s[5]; + s[2] = s[6]; + s[3] = s[7]; + + dst += 6 * dst_stride; + y -= 6; + } while (y); + t -= stride_hor * 2 * height_ver / 3; + t += 16; + dst -= height_ver * dst_stride; + dst += 8; + x -= 8; + } while (x); +} + +static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s, + const __m128i *const f) { + __m128i ss[4], temp; + + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + temp = convolve8_8_ssse3(ss, f); + return _mm_packus_epi16(temp, temp); +} + +// Only calculate odd columns since even columns are just src pixels' copies. 
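A scalar model of what one output row computes may help before the SIMD version: for a phase-0 2x upscale, output column 2*i is a verbatim copy of src[i], and output column 2*i + 1 applies the 8-tap half-pel kernel (phase 8, taps summing to 128) centered between src[i] and src[i + 1]. This sketch is illustrative only: the demo_* names are hypothetical, it assumes the caller provides 3 readable pixels left and 4 right of the row (matching the "- 3" offsets used below), and unlike the real helper, which writes only the filtered samples to a temporary row and lets the caller interleave them with the copies, it writes both halves.

#include <stdint.h>

#define SUBPEL_TAPS 8

static uint8_t demo_clip8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One 1:2 phase-0 row: even columns copy, odd columns use the 8-tap
 * half-pel kernel. src must be readable in [-3, src_w + 3]. */
static void demo_upscale_2x_row(const uint8_t *src, uint8_t *dst, int src_w,
                                const int16_t *half_pel) {
  int i, k;
  for (i = 0; i < src_w; ++i) {
    int sum = 0;
    dst[2 * i] = src[i]; /* even column: direct copy */
    for (k = 0; k < SUBPEL_TAPS; ++k) sum += half_pel[k] * src[i - 3 + k];
    dst[2 * i + 1] = demo_clip8((sum + 64) >> 7); /* odd column: filtered */
  }
}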
+static void scale_1_to_2_phase_0_row(const uint8_t *src, uint8_t *dst, + const int w, const __m128i *const f) { + int x = w; + + do { + __m128i s[8], temp; + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0)); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 1)); + s[2] = _mm_loadl_epi64((const __m128i *)(src + 2)); + s[3] = _mm_loadl_epi64((const __m128i *)(src + 3)); + s[4] = _mm_loadl_epi64((const __m128i *)(src + 4)); + s[5] = _mm_loadl_epi64((const __m128i *)(src + 5)); + s[6] = _mm_loadl_epi64((const __m128i *)(src + 6)); + s[7] = _mm_loadl_epi64((const __m128i *)(src + 7)); + temp = scale_1_to_2_phase_0_kernel(s, f); + _mm_storel_epi64((__m128i *)dst, temp); + src += 8; + dst += 8; + x -= 8; + } while (x); +} + +static void scale_plane_1_to_2_phase_0(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const int src_w, const int src_h, + const int16_t *const coef, + uint8_t *const temp_buffer) { + int max_width; + int y; + uint8_t *tmp[9]; + __m128i f[4]; + + max_width = (src_w + 7) & ~7; + tmp[0] = temp_buffer + 0 * max_width; + tmp[1] = temp_buffer + 1 * max_width; + tmp[2] = temp_buffer + 2 * max_width; + tmp[3] = temp_buffer + 3 * max_width; + tmp[4] = temp_buffer + 4 * max_width; + tmp[5] = temp_buffer + 5 * max_width; + tmp[6] = temp_buffer + 6 * max_width; + tmp[7] = temp_buffer + 7 * max_width; + + shuffle_filter_ssse3(coef, f); + + scale_1_to_2_phase_0_row(src - 3 * src_stride - 3, tmp[0], max_width, f); + scale_1_to_2_phase_0_row(src - 2 * src_stride - 3, tmp[1], max_width, f); + scale_1_to_2_phase_0_row(src - 1 * src_stride - 3, tmp[2], max_width, f); + scale_1_to_2_phase_0_row(src + 0 * src_stride - 3, tmp[3], max_width, f); + scale_1_to_2_phase_0_row(src + 1 * src_stride - 3, tmp[4], max_width, f); + scale_1_to_2_phase_0_row(src + 2 * src_stride - 3, tmp[5], max_width, f); + scale_1_to_2_phase_0_row(src + 3 * src_stride - 3, tmp[6], max_width, f); + + y = src_h; + do { + int x; + scale_1_to_2_phase_0_row(src + 4 * src_stride - 3, tmp[7], max_width, f); + for (x = 0; x < max_width; x += 8) { + __m128i s[8], C, D, CD; + + // Even rows + const __m128i a = _mm_loadl_epi64((const __m128i *)(src + x)); + const __m128i b = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + const __m128i ab = _mm_unpacklo_epi8(a, b); + _mm_storeu_si128((__m128i *)(dst + 2 * x), ab); + + // Odd rows + // Even columns + load_8bit_8x8(src + x - 3 * src_stride, src_stride, s); + C = scale_1_to_2_phase_0_kernel(s, f); + + // Odd columns + s[0] = _mm_loadl_epi64((const __m128i *)(tmp[0] + x)); + s[1] = _mm_loadl_epi64((const __m128i *)(tmp[1] + x)); + s[2] = _mm_loadl_epi64((const __m128i *)(tmp[2] + x)); + s[3] = _mm_loadl_epi64((const __m128i *)(tmp[3] + x)); + s[4] = _mm_loadl_epi64((const __m128i *)(tmp[4] + x)); + s[5] = _mm_loadl_epi64((const __m128i *)(tmp[5] + x)); + s[6] = _mm_loadl_epi64((const __m128i *)(tmp[6] + x)); + s[7] = _mm_loadl_epi64((const __m128i *)(tmp[7] + x)); + D = scale_1_to_2_phase_0_kernel(s, f); + + CD = _mm_unpacklo_epi8(C, D); + _mm_storeu_si128((__m128i *)(dst + dst_stride + 2 * x), CD); } - } + + src += src_stride; + dst += 2 * dst_stride; + tmp[8] = tmp[0]; + tmp[0] = tmp[1]; + tmp[1] = tmp[2]; + tmp[2] = tmp[3]; + tmp[3] = tmp[4]; + tmp[4] = tmp[5]; + tmp[5] = tmp[6]; + tmp[6] = tmp[7]; + tmp[7] = tmp[8]; + } while (--y); } void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, @@ -177,30 +756,152 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src, const int dst_h = dst->y_crop_height; 
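  // Dispatch summary for the ratios handled below: 2:1 and 4:1 each have a
  // phase-0 fast path, a bilinear path, and a general 8-tap path; 4:3 always
  // goes through its general 8-tap path; 1:2 is handled only for phase 0.
  // Paths that need a temporary buffer leave scaled at 0 when malloc() fails,
  // so vp9_scale_and_extend_frame_c() runs as the fallback. On success,
  // vpx_extend_frame_borders(dst) is called exactly once at the end.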
const int dst_uv_w = dst_w / 2; const int dst_uv_h = dst_h / 2; + int scaled = 0; - if (dst_w * 2 == src_w && dst_h * 2 == src_h && phase_scaler == 0) { - downsample_2_to_1_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - downsample_2_to_1_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - downsample_2_to_1_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); - } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { - // The upsample() supports widths up to 1920 * 2. If greater, fall back - // to vp9_scale_and_extend_frame_c(). - if (dst_w / 2 <= 1920) { - upsample_1_to_2_ssse3(src->y_buffer, src->y_stride, dst->y_buffer, - dst->y_stride, dst_w, dst_h); - upsample_1_to_2_ssse3(src->u_buffer, src->uv_stride, dst->u_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - upsample_1_to_2_ssse3(src->v_buffer, src->uv_stride, dst->v_buffer, - dst->uv_stride, dst_uv_w, dst_uv_h); - vpx_extend_frame_borders(dst); + // phase_scaler is usually 0 or 8. + assert(phase_scaler >= 0 && phase_scaler < 16); + + if (dst_w * 2 == src_w && dst_h * 2 == src_h) { + // 2 to 1 + scaled = 1; + + if (phase_scaler == 0) { + scale_plane_2_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_2_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_2_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_2_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_2_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_2_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + } else { + const int buffer_stride = (dst_w + 3) & ~3; + const int buffer_height = (2 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height); + if (temp_buffer) { + scale_plane_2_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_2_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_2_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == src_w && 4 * dst_h == src_h) { + // 4 to 1 + scaled = 1; + if (phase_scaler == 0) { + scale_plane_4_to_1_phase_0(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h); + scale_plane_4_to_1_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + scale_plane_4_to_1_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h); + } else if (filter_type == BILINEAR) { + const int16_t c0 = 
vp9_filter_kernels[BILINEAR][phase_scaler][3]; + const int16_t c1 = vp9_filter_kernels[BILINEAR][phase_scaler][4]; + const __m128i c0c1 = _mm_set1_epi16(c0 | (c1 << 8)); // c0 and c1 >= 0 + scale_plane_4_to_1_bilinear(src->y_buffer, src->y_stride, dst->y_buffer, + dst->y_stride, dst_w, dst_h, c0c1); + scale_plane_4_to_1_bilinear(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); + scale_plane_4_to_1_bilinear(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, c0c1); } else { - vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); + const int buffer_stride = (dst_w + 1) & ~1; + const int buffer_height = (4 * dst_h + SUBPEL_TAPS - 2 + 7) & ~7; + // When dst_w is 1 or 2, we need extra padding to avoid heap read overflow + const int extra_padding = 16; + uint8_t *const temp_buffer = + (uint8_t *)malloc(buffer_stride * buffer_height + extra_padding); + if (temp_buffer) { + scale_plane_4_to_1_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type][phase_scaler], temp_buffer); + scale_plane_4_to_1_general( + src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + scale_plane_4_to_1_general( + src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride, + dst_uv_w, dst_uv_h, vp9_filter_kernels[filter_type][phase_scaler], + temp_buffer); + free(temp_buffer); + } else { + scaled = 0; + } + } + } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) { + // 4 to 3 + const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2; + const int buffer_stride_ver = (dst_w + 7) & ~7; + const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7; + // When the vertical filter reads more pixels than the horizontal filter + // generated in each row, we need extra padding to avoid heap read overflow. + // For example, the horizontal filter generates 18 pixels but the vertical + // filter reads 24 pixels in a row. The difference is multiplied by 2 since + // two rows are interlaced together in the optimization. + const int extra_padding = (buffer_stride_ver > buffer_stride_hor) + ? 
2 * (buffer_stride_ver - buffer_stride_hor) + : 0; + const int buffer_size = buffer_stride_hor * buffer_height + extra_padding; + uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size); + if (temp_buffer) { + scaled = 1; + scale_plane_4_to_3_general( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w, + dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer); + scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, dst_uv_w, dst_uv_h, + vp9_filter_kernels[filter_type], phase_scaler, + temp_buffer); + free(temp_buffer); + } + } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) { + // 1 to 2 + uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7)); + if (temp_buffer) { + scaled = 1; + scale_plane_1_to_2_phase_0( + src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, src_w, + src_h, vp9_filter_kernels[filter_type][8], temp_buffer); + scale_plane_1_to_2_phase_0(src->u_buffer, src->uv_stride, dst->u_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + scale_plane_1_to_2_phase_0(src->v_buffer, src->uv_stride, dst->v_buffer, + dst->uv_stride, src_w / 2, src_h / 2, + vp9_filter_kernels[filter_type][8], + temp_buffer); + free(temp_buffer); } + } + + if (scaled) { + vpx_extend_frame_borders(dst); } else { + // Call c version for all other scaling ratios. vp9_scale_and_extend_frame_c(src, dst, filter_type, phase_scaler); } } diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c index 4a2581a34..ca0ad4407 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> #include <emmintrin.h> #include <xmmintrin.h> @@ -25,8 +26,12 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i zero; __m128i thr; int16_t nzflag; + __m128i eob; + __m128i round, quant, dequant; (void)scan_ptr; + (void)skip_block; + assert(!skip_block); coeff_ptr += n_coeffs; iscan_ptr += n_coeffs; @@ -35,40 +40,106 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i round, quant, dequant; + { + __m128i coeff0, coeff1; + + // Setup global values { - __m128i coeff0, coeff1; + round = _mm_load_si128((const __m128i *)round_ptr); + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + } - // Setup global values - { - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - } + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + // Do DC and first 15 AC + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_adds_epi16(qcoeff0, round); + round = _mm_unpackhi_epi64(round, round); + qcoeff1 = _mm_adds_epi16(qcoeff1, round); + qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); + quant = _mm_unpackhi_epi64(quant, quant); + qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); + + // Reinsert signs + qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); + qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); + + coeff0 = _mm_mullo_epi16(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = _mm_mullo_epi16(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); + } - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + 
eob = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob = _mm_max_epi16(eob, eob1); + } + n_coeffs += 8 * 2; + } + thr = _mm_srai_epi16(dequant, 1); + + // AC only loop + while (n_coeffs < 0) { + __m128i coeff0, coeff1; + { + __m128i coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i qtmp0, qtmp1; + + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); + + // Poor man's sign extract + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); + qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); + qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); + qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); + + nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | + _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); + + if (nzflag) { qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); qcoeff1 = _mm_adds_epi16(qcoeff1, round); qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); // Reinsert signs @@ -81,131 +152,51 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } + } else { + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } - n_coeffs += 8 * 2; } - thr = _mm_srai_epi16(dequant, 1); - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) | - _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr)); - - if (nzflag) { - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign); - qcoeff1 = 
_mm_xor_si128(qtmp1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - } - } - - if (nzflag) { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; + if (nzflag) { + // Scan for eob + __m128i zero_coeff0, zero_coeff1; + __m128i nzero_coeff0, nzero_coeff1; + __m128i iscan0, iscan1; + __m128i eob0, eob1; + zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); + zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); + nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); + nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); + iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + // Add one to convert from indices to counts + iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); + iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); + eob0 = _mm_and_si128(iscan0, nzero_coeff0); + eob1 = _mm_and_si128(iscan1, nzero_coeff1); + eob0 = _mm_max_epi16(eob0, eob1); + eob = _mm_max_epi16(eob, eob0); } + n_coeffs += 8 * 2; + } - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + // Accumulate EOB + { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + *eob_ptr = _mm_extract_epi16(eob, 1); } } diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm index 1b88863f6..5703aa3bb 100644 --- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm +++ 
b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm @@ -22,8 +22,6 @@ SECTION .text cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ qcoeff, dqcoeff, dequant, \ eob, scan, iscan - cmp dword skipm, 0 - jne .blank ; actual quantize loop - setup pointers, rounders, etc. movifnidn coeffq, coeffmp @@ -171,28 +169,7 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \ pshuflw m7, m8, 0x1 pmaxsw m8, m7 pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - - lea r0q, [r0q+ncoeffq*2] - lea r2q, [r2q+ncoeffq*2] - neg ncoeffq - pxor m7, m7 -.blank_loop: - STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq - STORE_ZERO_TRAN_LOW 7, r0q, ncoeffq + 8 - STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq - STORE_ZERO_TRAN_LOW 7, r2q, ncoeffq + 8 - add ncoeffq, mmsize - jl .blank_loop - mov word [r3q], 0 + mov [r2], r6w RET %endmacro diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c index 25fc80a9a..881caae78 100644 --- a/libvpx/vp9/vp9_cx_iface.c +++ b/libvpx/vp9/vp9_cx_iface.c @@ -171,12 +171,17 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); RANGE_CHECK_HI(cfg, rc_overshoot_pct, 100); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); + RANGE_CHECK(cfg, rc_2pass_vbr_corpus_complexity, 0, 10000); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); RANGE_CHECK_BOOL(cfg, rc_resize_allowed); RANGE_CHECK_HI(cfg, rc_dropframe_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_up_thresh, 100); RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100); +#if CONFIG_REALTIME_ONLY + RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_ONE_PASS); +#else RANGE_CHECK(cfg, g_pass, VPX_RC_ONE_PASS, VPX_RC_LAST_PASS); +#endif RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1)); if (extra_cfg->max_gf_interval > 0) { @@ -187,6 +192,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, (MAX_LAG_BUFFERS - 1)); } + + // For formation of valid ARF groups, lag_in_frames should be 0 or at + // least max_gf_interval + 2. + if (cfg->g_lag_in_frames > 0 && extra_cfg->max_gf_interval > 0 && + cfg->g_lag_in_frames < extra_cfg->max_gf_interval + 2) { + ERROR("Set lag in frames to 0 (low delay) or >= (max-gf-interval + 2)"); + } + if (cfg->rc_resize_allowed == 1) { RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w); RANGE_CHECK(cfg, rc_scaled_height, 0, cfg->g_h); @@ -202,7 +214,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, level != LEVEL_4 && level != LEVEL_4_1 && level != LEVEL_5 && level != LEVEL_5_1 && level != LEVEL_5_2 && level != LEVEL_6 && level != LEVEL_6_1 && level != LEVEL_6_2 && level != LEVEL_UNKNOWN && - level != LEVEL_MAX) + level != LEVEL_AUTO && level != LEVEL_MAX) ERROR("target_level is invalid"); } @@ -269,6 +281,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, if (extra_cfg->tuning == VP8_TUNE_SSIM) ERROR("Option --tune=ssim is not currently supported in VP9."); +#if !CONFIG_REALTIME_ONLY if (cfg->g_pass == VPX_RC_LAST_PASS) { const size_t packet_sz = sizeof(FIRSTPASS_STATS); const int n_packets = (int)(cfg->rc_twopass_stats_in.sz / packet_sz); @@ -320,6 +333,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, ERROR("rc_twopass_stats_in missing EOS stats packet"); } } +#endif // !CONFIG_REALTIME_ONLY #if !CONFIG_VP9_HIGHBITDEPTH if (cfg->g_profile >
(unsigned int)PROFILE_1) { @@ -425,10 +439,20 @@ static void config_target_level(VP9EncoderConfig *oxcf) { oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63); // Adjust minimum art-ref distance. - if (oxcf->min_gf_interval < - (int)vp9_level_defs[target_level_index].min_altref_distance) + // min_gf_interval should be no less than min_altref_distance + 1, + // as the encoder may produce bitstream with alt-ref distance being + // min_gf_interval - 1. + if (oxcf->min_gf_interval <= + (int)vp9_level_defs[target_level_index].min_altref_distance) { oxcf->min_gf_interval = - (int)vp9_level_defs[target_level_index].min_altref_distance; + (int)vp9_level_defs[target_level_index].min_altref_distance + 1; + // If oxcf->max_gf_interval == 0, it will be assigned with a default value + // in vp9_rc_set_gf_interval_range(). + if (oxcf->max_gf_interval != 0) { + oxcf->max_gf_interval = + VPXMAX(oxcf->max_gf_interval, oxcf->min_gf_interval); + } + } // Adjust maximum column tiles. if (vp9_level_defs[target_level_index].max_col_tiles < @@ -503,6 +527,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->two_pass_vbrbias = cfg->rc_2pass_vbr_bias_pct; oxcf->two_pass_vbrmin_section = cfg->rc_2pass_vbr_minsection_pct; oxcf->two_pass_vbrmax_section = cfg->rc_2pass_vbr_maxsection_pct; + oxcf->vbr_corpus_complexity = cfg->rc_2pass_vbr_corpus_complexity; oxcf->auto_key = cfg->kf_mode == VPX_KF_AUTO && cfg->kf_min_dist != cfg->kf_max_dist; @@ -613,6 +638,7 @@ static vpx_codec_err_t set_encoder_config( printf("two_pass_vbrbias: %d\n", oxcf->two_pass_vbrbias); printf("two_pass_vbrmin_section: %d\n", oxcf->two_pass_vbrmin_section); printf("two_pass_vbrmax_section: %d\n", oxcf->two_pass_vbrmax_section); + printf("vbr_corpus_complexity: %d\n", oxcf->vbr_corpus_complexity); printf("lag_in_frames: %d\n", oxcf->lag_in_frames); printf("enable_auto_arf: %d\n", oxcf->enable_auto_arf); printf("Version: %d\n", oxcf->Version); @@ -888,12 +914,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, priv->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (priv->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&priv->buffer_pool->pool_mutex, NULL)) { - return VPX_CODEC_MEM_ERROR; - } -#endif - if (ctx->config.enc) { // Update the reference to the config structure to an internal copy. priv->cfg = *ctx->config.enc; @@ -925,9 +945,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, static vpx_codec_err_t encoder_destroy(vpx_codec_alg_priv_t *ctx) { free(ctx->cx_data); vp9_remove_compressor(ctx->cpi); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -938,6 +955,10 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, unsigned long deadline) { MODE new_mode = BEST; +#if CONFIG_REALTIME_ONLY + (void)duration; + deadline = VPX_DL_REALTIME; +#else switch (ctx->cfg.g_pass) { case VPX_RC_ONE_PASS: if (deadline > 0) { @@ -958,6 +979,7 @@ static void pick_quickcompress_mode(vpx_codec_alg_priv_t *ctx, case VPX_RC_FIRST_PASS: break; case VPX_RC_LAST_PASS: new_mode = deadline > 0 ? 
GOOD : BEST; break; } +#endif // CONFIG_REALTIME_ONLY if (deadline == VPX_DL_REALTIME) { ctx->oxcf.pass = 0; @@ -1266,8 +1288,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, cx_data += size; cx_data_sz -= size; -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) if (cpi->use_svc && !ctx->output_cx_pkt_cb.output_cx_pkt) { vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr; int sl; @@ -1288,7 +1309,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr); } #endif -#endif if (is_one_pass_cbr_svc(cpi) && (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)) { // Encoded all spatial layers; exit loop. @@ -1679,6 +1699,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 50, // rc_two_pass_vbrbias 0, // rc_two_pass_vbrmin_section 2000, // rc_two_pass_vbrmax_section + 0, // rc_2pass_vbr_corpus_complexity (non 0 for corpus vbr) // keyframing settings (kf) VPX_KF_AUTO, // g_kfmode diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c index 1da1794b7..657490f4b 100644 --- a/libvpx/vp9/vp9_dx_iface.c +++ b/libvpx/vp9/vp9_dx_iface.c @@ -47,9 +47,6 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, ctx->priv->init_flags = ctx->init_flags; priv->si.sz = sizeof(priv->si); priv->flushed = 0; - // TODO(jzern): remnants of frame-level parallel decoding should be - // removed. cf., https://bugs.chromium.org/p/webm/issues/detail?id=1395 - priv->frame_parallel_decode = 0; if (ctx->config.dec) { priv->cfg = *ctx->config.dec; ctx->config.dec = &priv->cfg; @@ -60,33 +57,8 @@ static vpx_codec_err_t decoder_init(vpx_codec_ctx_t *ctx, } static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { - if (ctx->frame_workers != NULL) { - int i; - // Shutdown all threads before reclaiming any memory. The frame-level - // parallel decoder may access data from another worker. 
- for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - vpx_get_worker_interface()->end(worker); - } - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - vp9_remove_common(&frame_worker_data->pbi->common); -#if CONFIG_VP9_POSTPROC - vp9_free_postproc_buffers(&frame_worker_data->pbi->common); -#endif - vp9_decoder_remove(frame_worker_data->pbi); - vpx_free(frame_worker_data->scratch_buffer); -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&frame_worker_data->stats_mutex); - pthread_cond_destroy(&frame_worker_data->stats_cond); -#endif - vpx_free(frame_worker_data); - } -#if CONFIG_MULTITHREAD - pthread_mutex_destroy(&ctx->buffer_pool->pool_mutex); -#endif + if (ctx->pbi != NULL) { + vp9_decoder_remove(ctx->pbi); } if (ctx->buffer_pool) { @@ -94,7 +66,6 @@ static vpx_codec_err_t decoder_destroy(vpx_codec_alg_priv_t *ctx) { vp9_free_internal_frame_buffers(&ctx->buffer_pool->int_frame_buffers); } - vpx_free(ctx->frame_workers); vpx_free(ctx->buffer_pool); vpx_free(ctx); return VPX_CODEC_OK; @@ -228,32 +199,26 @@ static vpx_codec_err_t update_error_state( } static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) { - int i; - - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - BufferPool *const pool = cm->buffer_pool; - - cm->new_fb_idx = INVALID_IDX; - cm->byte_alignment = ctx->byte_alignment; - cm->skip_loop_filter = ctx->skip_loop_filter; - - if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { - pool->get_fb_cb = ctx->get_ext_fb_cb; - pool->release_fb_cb = ctx->release_ext_fb_cb; - pool->cb_priv = ctx->ext_priv; - } else { - pool->get_fb_cb = vp9_get_frame_buffer; - pool->release_fb_cb = vp9_release_frame_buffer; + VP9_COMMON *const cm = &ctx->pbi->common; + BufferPool *const pool = cm->buffer_pool; - if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to initialize internal frame buffers"); + cm->new_fb_idx = INVALID_IDX; + cm->byte_alignment = ctx->byte_alignment; + cm->skip_loop_filter = ctx->skip_loop_filter; - pool->cb_priv = &pool->int_frame_buffers; - } + if (ctx->get_ext_fb_cb != NULL && ctx->release_ext_fb_cb != NULL) { + pool->get_fb_cb = ctx->get_ext_fb_cb; + pool->release_fb_cb = ctx->release_ext_fb_cb; + pool->cb_priv = ctx->ext_priv; + } else { + pool->get_fb_cb = vp9_get_frame_buffer; + pool->release_fb_cb = vp9_release_frame_buffer; + + if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to initialize internal frame buffers"); + + pool->cb_priv = &pool->int_frame_buffers; } } @@ -270,124 +235,21 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { flags->noise_level = ctx->postproc_cfg.noise_level; } -static int frame_worker_hook(void *arg1, void *arg2) { - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)arg1; - const uint8_t *data = frame_worker_data->data; - (void)arg2; - - frame_worker_data->result = vp9_receive_compressed_data( - frame_worker_data->pbi, frame_worker_data->data_size, &data); - frame_worker_data->data_end = data; - - if (frame_worker_data->pbi->frame_parallel_decode) { - // In frame 
parallel decoding, a worker thread must successfully decode all - // the compressed data. - if (frame_worker_data->result != 0 || - frame_worker_data->data + frame_worker_data->data_size - 1 > data) { - VPxWorker *const worker = frame_worker_data->pbi->frame_worker_owner; - BufferPool *const pool = frame_worker_data->pbi->common.buffer_pool; - // Signal all the other threads that are waiting for this frame. - vp9_frameworker_lock_stats(worker); - frame_worker_data->frame_context_ready = 1; - lock_buffer_pool(pool); - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - unlock_buffer_pool(pool); - frame_worker_data->pbi->need_resync = 1; - vp9_frameworker_signal_stats(worker); - vp9_frameworker_unlock_stats(worker); - return 0; - } - } else if (frame_worker_data->result != 0) { - // Check decode result in serial decode. - frame_worker_data->pbi->cur_buf->buf.corrupted = 1; - frame_worker_data->pbi->need_resync = 1; - } - return !frame_worker_data->result; -} - static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { - int i; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - ctx->last_show_frame = -1; - ctx->next_submit_worker_id = 0; - ctx->last_submit_worker_id = 0; - ctx->next_output_worker_id = 0; - ctx->frame_cache_read = 0; - ctx->frame_cache_write = 0; - ctx->num_cache_frames = 0; ctx->need_resync = 1; - ctx->num_frame_workers = - (ctx->frame_parallel_decode == 1) ? ctx->cfg.threads : 1; - if (ctx->num_frame_workers > MAX_DECODE_THREADS) - ctx->num_frame_workers = MAX_DECODE_THREADS; - ctx->available_threads = ctx->num_frame_workers; ctx->flushed = 0; ctx->buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(BufferPool)); if (ctx->buffer_pool == NULL) return VPX_CODEC_MEM_ERROR; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&ctx->buffer_pool->pool_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate buffer pool mutex"); + ctx->pbi = vp9_decoder_create(ctx->buffer_pool); + if (ctx->pbi == NULL) { + set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } -#endif - - ctx->frame_workers = (VPxWorker *)vpx_malloc(ctx->num_frame_workers * - sizeof(*ctx->frame_workers)); - if (ctx->frame_workers == NULL) { - set_error_detail(ctx, "Failed to allocate frame_workers"); - return VPX_CODEC_MEM_ERROR; - } - - for (i = 0; i < ctx->num_frame_workers; ++i) { - VPxWorker *const worker = &ctx->frame_workers[i]; - FrameWorkerData *frame_worker_data = NULL; - winterface->init(worker); - worker->data1 = vpx_memalign(32, sizeof(FrameWorkerData)); - if (worker->data1 == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi = vp9_decoder_create(ctx->buffer_pool); - if (frame_worker_data->pbi == NULL) { - set_error_detail(ctx, "Failed to allocate frame_worker_data"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->pbi->frame_worker_owner = worker; - frame_worker_data->worker_id = i; - frame_worker_data->scratch_buffer = NULL; - frame_worker_data->scratch_buffer_size = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 0; -#if CONFIG_MULTITHREAD - if (pthread_mutex_init(&frame_worker_data->stats_mutex, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data mutex"); - return VPX_CODEC_MEM_ERROR; - } - - if (pthread_cond_init(&frame_worker_data->stats_cond, NULL)) { - set_error_detail(ctx, "Failed to allocate frame_worker_data cond"); - return 
VPX_CODEC_MEM_ERROR; - } -#endif - // If decoding in serial mode, FrameWorker thread could create tile worker - // thread or loopfilter thread. - frame_worker_data->pbi->max_threads = - (ctx->frame_parallel_decode == 0) ? ctx->cfg.threads : 0; - - frame_worker_data->pbi->inv_tile_order = ctx->invert_tile_order; - frame_worker_data->pbi->frame_parallel_decode = ctx->frame_parallel_decode; - frame_worker_data->pbi->common.frame_parallel_decode = - ctx->frame_parallel_decode; - worker->hook = (VPxWorkerHook)frame_worker_hook; - if (!winterface->reset(worker)) { - set_error_detail(ctx, "Frame Worker thread creation failed"); - return VPX_CODEC_MEM_ERROR; - } - } + ctx->pbi->max_threads = ctx->cfg.threads; + ctx->pbi->inv_tile_order = ctx->invert_tile_order; // If postprocessing was enabled by the application and a // configuration has not been provided, default it. @@ -401,7 +263,7 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, const VP9Decoder *const pbi) { - // Clear resync flag if worker got a key frame or intra only frame. + // Clear resync flag if the decoder got a key frame or intra only frame. if (ctx->need_resync == 1 && pbi->need_resync == 0 && (pbi->common.intra_only || pbi->common.frame_type == KEY_FRAME)) ctx->need_resync = 0; @@ -410,7 +272,6 @@ static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, const uint8_t **data, unsigned int data_sz, void *user_priv, int64_t deadline) { - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); (void)deadline; // Determine the stream parameters. Note that we rely on peek_si to @@ -426,101 +287,23 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, if (!ctx->si.is_kf && !is_intra_only) return VPX_CODEC_ERROR; } - if (!ctx->frame_parallel_decode) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->data = *data; - frame_worker_data->data_size = data_sz; - frame_worker_data->user_priv = user_priv; - frame_worker_data->received_frame = 1; - - // Set these even if already initialized. The caller may have changed the - // decrypt config between frames. - frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb; - frame_worker_data->pbi->decrypt_state = ctx->decrypt_state; - - worker->had_error = 0; - winterface->execute(worker); + ctx->user_priv = user_priv; - // Update data pointer after decode. - *data = frame_worker_data->data_end; + // Set these even if already initialized. The caller may have changed the + // decrypt config between frames. + ctx->pbi->decrypt_cb = ctx->decrypt_cb; + ctx->pbi->decrypt_state = ctx->decrypt_state; - if (worker->had_error) - return update_error_state(ctx, &frame_worker_data->pbi->common.error); - - check_resync(ctx, frame_worker_data->pbi); - } else { - VPxWorker *const worker = &ctx->frame_workers[ctx->next_submit_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - // Copy context from last worker thread to next worker thread. - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - vp9_frameworker_copy_context( - &ctx->frame_workers[ctx->next_submit_worker_id], - &ctx->frame_workers[ctx->last_submit_worker_id]); - - frame_worker_data->pbi->ready_for_new_data = 0; - // Copy the compressed data into worker's internal buffer. 
- // TODO(hkuang): Will all the workers allocate the same size - // as the size of the first intra frame be better? This will - // avoid too many deallocate and allocate. - if (frame_worker_data->scratch_buffer_size < data_sz) { - vpx_free(frame_worker_data->scratch_buffer); - frame_worker_data->scratch_buffer = (uint8_t *)vpx_malloc(data_sz); - if (frame_worker_data->scratch_buffer == NULL) { - set_error_detail(ctx, "Failed to reallocate scratch buffer"); - return VPX_CODEC_MEM_ERROR; - } - frame_worker_data->scratch_buffer_size = data_sz; - } - frame_worker_data->data_size = data_sz; - memcpy(frame_worker_data->scratch_buffer, *data, data_sz); - - frame_worker_data->frame_decoded = 0; - frame_worker_data->frame_context_ready = 0; - frame_worker_data->received_frame = 1; - frame_worker_data->data = frame_worker_data->scratch_buffer; - frame_worker_data->user_priv = user_priv; - - if (ctx->next_submit_worker_id != ctx->last_submit_worker_id) - ctx->last_submit_worker_id = - (ctx->last_submit_worker_id + 1) % ctx->num_frame_workers; - - ctx->next_submit_worker_id = - (ctx->next_submit_worker_id + 1) % ctx->num_frame_workers; - --ctx->available_threads; - worker->had_error = 0; - winterface->launch(worker); + if (vp9_receive_compressed_data(ctx->pbi, data_sz, data)) { + ctx->pbi->cur_buf->buf.corrupted = 1; + ctx->pbi->need_resync = 1; + ctx->need_resync = 1; + return update_error_state(ctx, &ctx->pbi->common.error); } - return VPX_CODEC_OK; -} + check_resync(ctx, ctx->pbi); -static void wait_worker_and_cache_frame(vpx_codec_alg_priv_t *ctx) { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - // TODO(hkuang): Add worker error handling here. - winterface->sync(worker); - frame_worker_data->received_frame = 0; - ++ctx->available_threads; - - check_resync(ctx, frame_worker_data->pbi); - - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - ctx->frame_cache[ctx->frame_cache_write].fb_idx = cm->new_fb_idx; - yuvconfig2image(&ctx->frame_cache[ctx->frame_cache_write].img, &sd, - frame_worker_data->user_priv); - ctx->frame_cache[ctx->frame_cache_write].img.fb_priv = - frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - ctx->frame_cache_write = (ctx->frame_cache_write + 1) % FRAME_CACHE_SIZE; - ++ctx->num_cache_frames; - } + return VPX_CODEC_OK; } static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, @@ -540,8 +323,8 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Reset flushed when receiving a valid frame. ctx->flushed = 0; - // Initialize the decoder workers on the first frame. - if (ctx->frame_workers == NULL) { + // Initialize the decoder on the first frame. + if (ctx->pbi == NULL) { const vpx_codec_err_t res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -553,91 +336,37 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1) frame_count = ctx->svc_spatial_layer + 1; - if (ctx->frame_parallel_decode) { - // Decode in frame parallel mode. 
When decoding in this mode, the frame - // passed to the decoder must be either a normal frame or a superframe with - // superframe index so the decoder could get each frame's start position - // in the superframe. - if (frame_count > 0) { - int i; - - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } - - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } - } + // Decode in serial mode. + if (frame_count > 0) { + int i; - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - data_start += frame_size; - } - } else { - if (ctx->available_threads == 0) { - // No more threads for decoding. Wait until the next output worker - // finishes decoding. Then copy the decoded frame into cache. - if (ctx->num_cache_frames < FRAME_CACHE_SIZE) { - wait_worker_and_cache_frame(ctx); - } else { - // TODO(hkuang): Add unit test to test this path. - set_error_detail(ctx, "Frame output cache is full."); - return VPX_CODEC_ERROR; - } + for (i = 0; i < frame_count; ++i) { + const uint8_t *data_start_copy = data_start; + const uint32_t frame_size = frame_sizes[i]; + vpx_codec_err_t res; + if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { + set_error_detail(ctx, "Invalid frame size in index"); + return VPX_CODEC_CORRUPT_FRAME; } - res = decode_one(ctx, &data, data_sz, user_priv, deadline); + res = decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; + + data_start += frame_size; } } else { - // Decode in serial mode. - if (frame_count > 0) { - int i; - - for (i = 0; i < frame_count; ++i) { - const uint8_t *data_start_copy = data_start; - const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; - if (data_start < data || - frame_size > (uint32_t)(data_end - data_start)) { - set_error_detail(ctx, "Invalid frame size in index"); - return VPX_CODEC_CORRUPT_FRAME; - } - - res = - decode_one(ctx, &data_start_copy, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; + while (data_start < data_end) { + const uint32_t frame_size = (uint32_t)(data_end - data_start); + const vpx_codec_err_t res = + decode_one(ctx, &data_start, frame_size, user_priv, deadline); + if (res != VPX_CODEC_OK) return res; - data_start += frame_size; - } - } else { + // Account for suboptimal termination by the encoder. while (data_start < data_end) { - const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); - if (res != VPX_CODEC_OK) return res; - - // Account for suboptimal termination by the encoder. 
- while (data_start < data_end) { - const uint8_t marker = - read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); - if (marker) break; - ++data_start; - } + const uint8_t marker = + read_marker(ctx->decrypt_cb, ctx->decrypt_state, data_start); + if (marker) break; + ++data_start; } } } @@ -645,80 +374,28 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, return res; } -static void release_last_output_frame(vpx_codec_alg_priv_t *ctx) { - RefCntBuffer *const frame_bufs = ctx->buffer_pool->frame_bufs; - // Decrease reference count of last output frame in frame parallel mode. - if (ctx->frame_parallel_decode && ctx->last_show_frame >= 0) { - BufferPool *const pool = ctx->buffer_pool; - lock_buffer_pool(pool); - decrease_ref_count(ctx->last_show_frame, frame_bufs, pool); - unlock_buffer_pool(pool); - } -} - static vpx_image_t *decoder_get_frame(vpx_codec_alg_priv_t *ctx, vpx_codec_iter_t *iter) { vpx_image_t *img = NULL; - // Only return frame when all the cpu are busy or - // application fluhsed the decoder in frame parallel decode. - if (ctx->frame_parallel_decode && ctx->available_threads > 0 && - !ctx->flushed) { - return NULL; - } + // Legacy parameter carried over from VP8. Has no effect for VP9 since we + // always return only 1 frame per decode call. + (void)iter; - // Output the frames in the cache first. - if (ctx->num_cache_frames > 0) { - release_last_output_frame(ctx); - ctx->last_show_frame = ctx->frame_cache[ctx->frame_cache_read].fb_idx; - if (ctx->need_resync) return NULL; - img = &ctx->frame_cache[ctx->frame_cache_read].img; - ctx->frame_cache_read = (ctx->frame_cache_read + 1) % FRAME_CACHE_SIZE; - --ctx->num_cache_frames; - return img; - } - - // iter acts as a flip flop, so an image is only returned on the first - // call to get_frame. - if (*iter == NULL && ctx->frame_workers != NULL) { - do { - YV12_BUFFER_CONFIG sd; - vp9_ppflags_t flags = { 0, 0, 0 }; - const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - ctx->next_output_worker_id = - (ctx->next_output_worker_id + 1) % ctx->num_frame_workers; - if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) - set_ppflags(ctx, &flags); - // Wait for the frame from worker thread. - if (winterface->sync(worker)) { - // Check if worker has received any frames. - if (frame_worker_data->received_frame == 1) { - ++ctx->available_threads; - frame_worker_data->received_frame = 0; - check_resync(ctx, frame_worker_data->pbi); - } - if (vp9_get_raw_frame(frame_worker_data->pbi, &sd, &flags) == 0) { - VP9_COMMON *const cm = &frame_worker_data->pbi->common; - RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; - release_last_output_frame(ctx); - ctx->last_show_frame = frame_worker_data->pbi->common.new_fb_idx; - if (ctx->need_resync) return NULL; - yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv); - ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; - img = &ctx->img; - return img; - } - } else { - // Decoding failed. Release the worker thread. 
- frame_worker_data->received_frame = 0; - ++ctx->available_threads; - ctx->need_resync = 1; - if (ctx->flushed != 1) return NULL; - } - } while (ctx->next_output_worker_id != ctx->next_submit_worker_id); + if (ctx->pbi != NULL) { + YV12_BUFFER_CONFIG sd; + vp9_ppflags_t flags = { 0, 0, 0 }; + if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC) set_ppflags(ctx, &flags); + if (vp9_get_raw_frame(ctx->pbi, &sd, &flags) == 0) { + VP9_COMMON *const cm = &ctx->pbi->common; + RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs; + ctx->last_show_frame = ctx->pbi->common.new_fb_idx; + if (ctx->need_resync) return NULL; + yuvconfig2image(&ctx->img, &sd, ctx->user_priv); + ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv; + img = &ctx->img; + return img; + } } return NULL; } @@ -728,7 +405,7 @@ static vpx_codec_err_t decoder_set_fb_fn( vpx_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { if (cb_get == NULL || cb_release == NULL) { return VPX_CODEC_INVALID_PARAM; - } else if (ctx->frame_workers == NULL) { + } else if (ctx->pbi == NULL) { // If the decoder has already been initialized, do not accept changes to // the frame buffer functions. ctx->get_ext_fb_cb = cb_get; @@ -744,21 +421,12 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *const data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *const frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_set_reference_dec(&frame_worker_data->pbi->common, - ref_frame_to_vp9_reframe(frame->frame_type), - &sd); + return vp9_set_reference_dec( + &ctx->pbi->common, ref_frame_to_vp9_reframe(frame->frame_type), &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -768,20 +436,12 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vpx_ref_frame_t *data = va_arg(args, vpx_ref_frame_t *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { vpx_ref_frame_t *frame = (vpx_ref_frame_t *)data; YV12_BUFFER_CONFIG sd; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; image2yuvconfig(&frame->img, &sd); - return vp9_copy_reference_dec(frame_worker_data->pbi, - (VP9_REFFRAME)frame->frame_type, &sd); + return vp9_copy_reference_dec(ctx->pbi, (VP9_REFFRAME)frame->frame_type, + &sd); } else { return VPX_CODEC_INVALID_PARAM; } @@ -791,17 +451,9 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx, va_list args) { vp9_ref_frame_t *data = va_arg(args, vp9_ref_frame_t *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (data) { YV12_BUFFER_CONFIG *fb; - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - fb = get_ref_frame(&frame_worker_data->pbi->common, data->idx); + fb = get_ref_frame(&ctx->pbi->common, data->idx); if (fb == NULL) return VPX_CODEC_ERROR; yuvconfig2image(&data->img, fb, NULL); return VPX_CODEC_OK; @@ -832,9 +484,8 @@ static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); - if (arg == NULL) return VPX_CODEC_INVALID_PARAM; - *arg = - ((FrameWorkerData *)ctx->frame_workers[0].data1)->pbi->common.base_qindex; + if (arg == NULL || ctx->pbi == NULL) return VPX_CODEC_INVALID_PARAM; + *arg = ctx->pbi->common.base_qindex; return VPX_CODEC_OK; } @@ -842,18 +493,9 @@ static vpx_codec_err_t ctrl_get_last_ref_updates(vpx_codec_alg_priv_t *ctx, va_list args) { int *const update_info = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (update_info) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - *update_info = frame_worker_data->pbi->refresh_frame_flags; + if (ctx->pbi != NULL) { + *update_info = ctx->pbi->refresh_frame_flags; return VPX_CODEC_OK; } else { return VPX_CODEC_ERROR; @@ -868,14 +510,9 @@ static vpx_codec_err_t ctrl_get_frame_corrupted(vpx_codec_alg_priv_t *ctx, int *corrupted = va_arg(args, int *); if (corrupted) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - RefCntBuffer *const frame_bufs = - frame_worker_data->pbi->common.buffer_pool->frame_bufs; - if (frame_worker_data->pbi->common.frame_to_show == NULL) - return VPX_CODEC_ERROR; + if (ctx->pbi != NULL) { + RefCntBuffer *const frame_bufs = ctx->pbi->common.buffer_pool->frame_bufs; + if (ctx->pbi->common.frame_to_show == NULL) return VPX_CODEC_ERROR; if (ctx->last_show_frame >= 0) *corrupted = frame_bufs[ctx->last_show_frame].buf.corrupted; return VPX_CODEC_OK; @@ -891,18 +528,9 @@ static vpx_codec_err_t ctrl_get_frame_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const frame_size = va_arg(args, int *); - // Only support this function in serial decode. - if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (frame_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; frame_size[0] = cm->width; frame_size[1] = cm->height; return VPX_CODEC_OK; @@ -918,18 +546,9 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, va_list args) { int *const render_size = va_arg(args, int *); - // Only support this function in serial decode. 
- if (ctx->frame_parallel_decode) { - set_error_detail(ctx, "Not supported in frame parallel decode"); - return VPX_CODEC_INCAPABLE; - } - if (render_size) { - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; render_size[0] = cm->render_width; render_size[1] = cm->render_height; return VPX_CODEC_OK; @@ -944,13 +563,10 @@ static vpx_codec_err_t ctrl_get_render_size(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_get_bit_depth(vpx_codec_alg_priv_t *ctx, va_list args) { unsigned int *const bit_depth = va_arg(args, unsigned int *); - VPxWorker *const worker = &ctx->frame_workers[ctx->next_output_worker_id]; if (bit_depth) { - if (worker) { - FrameWorkerData *const frame_worker_data = - (FrameWorkerData *)worker->data1; - const VP9_COMMON *const cm = &frame_worker_data->pbi->common; + if (ctx->pbi != NULL) { + const VP9_COMMON *const cm = &ctx->pbi->common; *bit_depth = cm->bit_depth; return VPX_CODEC_OK; } else { @@ -989,10 +605,8 @@ static vpx_codec_err_t ctrl_set_byte_alignment(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_INVALID_PARAM; ctx->byte_alignment = byte_alignment; - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.byte_alignment = byte_alignment; + if (ctx->pbi != NULL) { + ctx->pbi->common.byte_alignment = byte_alignment; } return VPX_CODEC_OK; } @@ -1001,10 +615,8 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, va_list args) { ctx->skip_loop_filter = va_arg(args, int); - if (ctx->frame_workers) { - VPxWorker *const worker = ctx->frame_workers; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - frame_worker_data->pbi->common.skip_loop_filter = ctx->skip_loop_filter; + if (ctx->pbi != NULL) { + ctx->pbi->common.skip_loop_filter = ctx->skip_loop_filter; } return VPX_CODEC_OK; diff --git a/libvpx/vp9/vp9_dx_iface.h b/libvpx/vp9/vp9_dx_iface.h index c1559599b..18bc7ab0d 100644 --- a/libvpx/vp9/vp9_dx_iface.h +++ b/libvpx/vp9/vp9_dx_iface.h @@ -15,19 +15,12 @@ typedef vpx_codec_stream_info_t vp9_stream_info_t; -// This limit is due to framebuffer numbers. -// TODO(hkuang): Remove this limit after implementing ondemand framebuffers. -#define FRAME_CACHE_SIZE 6 // Cache maximum 6 decoded frames. - -typedef struct cache_frame { - int fb_idx; - vpx_image_t img; -} cache_frame; - struct vpx_codec_alg_priv { vpx_codec_priv_t base; vpx_codec_dec_cfg_t cfg; vp9_stream_info_t si; + VP9Decoder *pbi; + void *user_priv; int postproc_cfg_set; vp8_postproc_cfg_t postproc_cfg; vpx_decrypt_cb decrypt_cb; @@ -40,20 +33,8 @@ struct vpx_codec_alg_priv { int byte_alignment; int skip_loop_filter; - // Frame parallel related. - int frame_parallel_decode; // frame-based threading. - VPxWorker *frame_workers; - int num_frame_workers; - int next_submit_worker_id; - int last_submit_worker_id; - int next_output_worker_id; - int available_threads; - cache_frame frame_cache[FRAME_CACHE_SIZE]; - int frame_cache_write; - int frame_cache_read; - int num_cache_frames; int need_resync; // wait for key/intra-only frame - // BufferPool that holds all reference frames. Shared by all the FrameWorkers. + // BufferPool that holds all reference frames. 
BufferPool *buffer_pool; // External frame buffer info to save for VP9 common. diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk index 47846c941..d633ed142 100644 --- a/libvpx/vp9/vp9cx.mk +++ b/libvpx/vp9/vp9cx.mk @@ -130,6 +130,7 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c @@ -138,4 +139,10 @@ VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct8x8_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h +# Strip unnecessary files with CONFIG_REALTIME_ONLY +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c + VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/libvpx/vp9/vp9dx.mk b/libvpx/vp9/vp9dx.mk index 4c6fd0071..59f612b94 100644 --- a/libvpx/vp9/vp9dx.mk +++ b/libvpx/vp9/vp9dx.mk @@ -24,8 +24,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.c VP9_DX_SRCS-yes += decoder/vp9_decodemv.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.h -VP9_DX_SRCS-yes += decoder/vp9_dthread.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.h VP9_DX_SRCS-yes += decoder/vp9_decoder.c VP9_DX_SRCS-yes += decoder/vp9_decoder.h VP9_DX_SRCS-yes += decoder/vp9_dsubexp.c diff --git a/libvpx/vpx/src/svc_encodeframe.c b/libvpx/vpx/src/svc_encodeframe.c index c774abb34..f633600c7 100644 --- a/libvpx/vpx/src/svc_encodeframe.c +++ b/libvpx/vpx/src/svc_encodeframe.c @@ -131,9 +131,9 @@ static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt, static vpx_codec_err_t extract_option(LAYER_OPTION_TYPE type, char *input, int *value0, int *value1) { if (type == SCALE_FACTOR) { - *value0 = strtol(input, &input, 10); + *value0 = (int)strtol(input, &input, 10); if (*input++ != '/') return VPX_CODEC_INVALID_PARAM; - *value1 = strtol(input, &input, 10); + *value1 = (int)strtol(input, &input, 10); if (*value0 < option_min_values[SCALE_FACTOR] || *value1 < option_min_values[SCALE_FACTOR] || @@ -559,8 +559,7 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, iter = NULL; while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) { switch (cx_pkt->kind) { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) -#if CONFIG_SPATIAL_SVC +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: { int i; for (i = 0; i < svc_ctx->spatial_layers; ++i) { @@ -595,9 +594,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, break; } #endif -#endif case VPX_CODEC_PSNR_PKT: { -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +#if CONFIG_SPATIAL_SVC && defined(VPX_TEST_SPATIAL_SVC) int j; svc_log(svc_ctx, SVC_LOG_DEBUG, "frame: %d, layer: %d, PSNR(Total/Y/U/V): " diff --git a/libvpx/vpx/src/vpx_encoder.c b/libvpx/vpx/src/vpx_encoder.c index 4390cf7c8..1cf2dca69 100644 --- a/libvpx/vpx/src/vpx_encoder.c +++ b/libvpx/vpx/src/vpx_encoder.c @@ -12,8 +12,11 @@ * \brief Provides the high level interface to wrap encoder 
algorithms. * */ +#include <assert.h> #include <limits.h> +#include <stdlib.h> #include <string.h> +#include "vp8/common/blockd.h" #include "vpx_config.h" #include "vpx/internal/vpx_codec_internal.h" @@ -81,6 +84,8 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( int i; void *mem_loc = NULL; + if (iface->enc.mr_get_mem_loc == NULL) return VPX_CODEC_INCAPABLE; + if (!(res = iface->enc.mr_get_mem_loc(cfg, &mem_loc))) { for (i = 0; i < num_enc; i++) { vpx_codec_priv_enc_mr_cfg_t mr_cfg; @@ -89,28 +94,27 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( if (dsf->num < 1 || dsf->num > 4096 || dsf->den < 1 || dsf->den > dsf->num) { res = VPX_CODEC_INVALID_PARAM; - break; + } else { + mr_cfg.mr_low_res_mode_info = mem_loc; + mr_cfg.mr_total_resolutions = num_enc; + mr_cfg.mr_encoder_id = num_enc - 1 - i; + mr_cfg.mr_down_sampling_factor.num = dsf->num; + mr_cfg.mr_down_sampling_factor.den = dsf->den; + + /* Force Key-frame synchronization. Namely, encoder at higher + * resolution always use the same frame_type chosen by the + * lowest-resolution encoder. + */ + if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; + + ctx->iface = iface; + ctx->name = iface->name; + ctx->priv = NULL; + ctx->init_flags = flags; + ctx->config.enc = cfg; + res = ctx->iface->init(ctx, &mr_cfg); } - mr_cfg.mr_low_res_mode_info = mem_loc; - mr_cfg.mr_total_resolutions = num_enc; - mr_cfg.mr_encoder_id = num_enc - 1 - i; - mr_cfg.mr_down_sampling_factor.num = dsf->num; - mr_cfg.mr_down_sampling_factor.den = dsf->den; - - /* Force Key-frame synchronization. Namely, encoder at higher - * resolution always use the same frame_type chosen by the - * lowest-resolution encoder. - */ - if (mr_cfg.mr_encoder_id) cfg->kf_mode = VPX_KF_DISABLED; - - ctx->iface = iface; - ctx->name = iface->name; - ctx->priv = NULL; - ctx->init_flags = flags; - ctx->config.enc = cfg; - res = ctx->iface->init(ctx, &mr_cfg); - if (res) { const char *error_detail = ctx->priv ? ctx->priv->err_detail : NULL; /* Destroy current ctx */ @@ -124,10 +128,14 @@ vpx_codec_err_t vpx_codec_enc_init_multi_ver( vpx_codec_destroy(ctx); i--; } +#if CONFIG_MULTI_RES_ENCODING + assert(mem_loc); + free(((LOWER_RES_FRAME_INFO *)mem_loc)->mb_info); + free(mem_loc); +#endif + return SAVE_STATUS(ctx, res); } - if (res) break; - ctx++; cfg++; dsf++; diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h index ee6be4a24..c21b8b60d 100644 --- a/libvpx/vpx/vp8cx.h +++ b/libvpx/vpx/vp8cx.h @@ -333,11 +333,12 @@ enum vp8e_enc_control_id { * 2 = 4 tile columns * ..... * n = 2**n tile columns - * The requested tile columns will be capped by encoder based on image size - * limitation (The minimum width of a tile column is 256 pixel, the maximum - * is 4096). + * The requested tile columns will be capped by the encoder based on image + * size limitations (The minimum width of a tile column is 256 pixels, the + * maximum is 4096). * - * By default, the value is 0, i.e. one single column tile for entire image. + * By default, the value is 6, i.e., the maximum number of tiles supported by + * the resolution. * * Supported in codecs: VP9 */ @@ -368,10 +369,10 @@ enum vp8e_enc_control_id { * VP9 has a bitstream feature to reduce decoding dependency between frames * by turning off backward update of probability context used in encoding * and decoding. This allows staged parallel processing of more than one - * video frames in the decoder. This control function provides a mean to + * video frame in the decoder. 
This control function provides a means to * turn this feature on or off for bitstreams produced by encoder. * - * By default, this feature is off. + * By default, this feature is on. * * Supported in codecs: VP9 */ @@ -407,7 +408,7 @@ enum vp8e_enc_control_id { /*!\brief Codec control function to set noise sensitivity. * - * 0: off, 1: On(YOnly) + * 0: off, 1: On(YOnly), 2: For SVC only, on top two spatial layers(YOnly) * * Supported in codecs: VP9 */ @@ -443,6 +444,7 @@ enum vp8e_enc_control_id { * \note Valid parameter range: * VP9E_CONTENT_DEFAULT = Regular video content (Default) * VP9E_CONTENT_SCREEN = Screen capture content + * VP9E_CONTENT_FILM = Film content: improves grain retention * * Supported in codecs: VP9 */ @@ -695,6 +697,7 @@ typedef enum { typedef enum { VP9E_CONTENT_DEFAULT, VP9E_CONTENT_SCREEN, + VP9E_CONTENT_FILM, VP9E_CONTENT_INVALID } vp9e_tune_content; diff --git a/libvpx/vpx/vp8dx.h b/libvpx/vpx/vp8dx.h index 41c53e48d..398c67022 100644 --- a/libvpx/vpx/vp8dx.h +++ b/libvpx/vpx/vp8dx.h @@ -179,6 +179,8 @@ VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER #define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) +#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER +VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/libvpx/vpx/vpx_codec.h b/libvpx/vpx/vpx_codec.h index e91cd9e0d..ad05f4c74 100644 --- a/libvpx/vpx/vpx_codec.h +++ b/libvpx/vpx/vpx_codec.h @@ -46,34 +46,35 @@ extern "C" { #include "./vpx_integer.h" /*!\brief Decorator indicating a function is deprecated */ -#ifndef DEPRECATED +#ifndef VPX_DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DEPRECATED __attribute__((deprecated)) +#define VPX_DEPRECATED __attribute__((deprecated)) #elif defined(_MSC_VER) -#define DEPRECATED +#define VPX_DEPRECATED #else -#define DEPRECATED +#define VPX_DEPRECATED #endif -#endif /* DEPRECATED */ +#endif /* VPX_DEPRECATED */ -#ifndef DECLSPEC_DEPRECATED +#ifndef VPX_DECLSPEC_DEPRECATED #if defined(__GNUC__) && __GNUC__ -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #elif defined(_MSC_VER) -/*!\brief \copydoc #DEPRECATED */ -#define DECLSPEC_DEPRECATED __declspec(deprecated) +/*!\brief \copydoc #VPX_DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED __declspec(deprecated) #else -#define DECLSPEC_DEPRECATED /**< \copydoc #DEPRECATED */ +#define VPX_DECLSPEC_DEPRECATED /**< \copydoc #VPX_DEPRECATED */ #endif -#endif /* DECLSPEC_DEPRECATED */ +#endif /* VPX_DECLSPEC_DEPRECATED */ /*!\brief Decorator indicating a function is potentially unused */ -#ifdef UNUSED -#elif defined(__GNUC__) || defined(__clang__) -#define UNUSED __attribute__((unused)) +#ifndef VPX_UNUSED +#if defined(__GNUC__) || defined(__clang__) +#define VPX_UNUSED __attribute__((unused)) #else -#define UNUSED +#define VPX_UNUSED #endif +#endif /* VPX_UNUSED */ /*!\brief Current ABI version number * @@ -413,7 +414,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_USE_TYPE(id, typ) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int, typ) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id, typ data) { \ @@ -430,13 +431,13 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); * It defines a static function with the correctly typed arguments as a * 
wrapper to the type-unsafe internal function. */ -#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *, int, typ) DEPRECATED UNUSED; \ - \ - DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ - vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ - return vpx_codec_control_(ctx, ctrl_id, data); \ +#define VPX_CTRL_USE_TYPE_DEPRECATED(id, typ) \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *, int, typ) VPX_DEPRECATED VPX_UNUSED; \ + \ + VPX_DECLSPEC_DEPRECATED static vpx_codec_err_t vpx_codec_control_##id( \ + vpx_codec_ctx_t *ctx, int ctrl_id, typ data) { \ + return vpx_codec_control_(ctx, ctrl_id, data); \ } /**<\hideinitializer*/ /*!\brief vpx_codec_control void type definition macro @@ -451,7 +452,7 @@ vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...); */ #define VPX_CTRL_VOID(id) \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *, int) \ - UNUSED; \ + VPX_UNUSED; \ \ static vpx_codec_err_t vpx_codec_control_##id(vpx_codec_ctx_t *ctx, \ int ctrl_id) { \ diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h index c915ed671..464bc408c 100644 --- a/libvpx/vpx/vpx_encoder.h +++ b/libvpx/vpx/vpx_encoder.h @@ -63,7 +63,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (5 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (6 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -154,9 +154,8 @@ enum vpx_codec_cx_pkt_kind { VPX_CODEC_STATS_PKT, /**< Two-pass statistics for this frame */ VPX_CODEC_FPMB_STATS_PKT, /**< first pass mb statistics for this frame */ VPX_CODEC_PSNR_PKT, /**< PSNR statistics for this frame */ -// Spatial SVC is still experimental and may be removed before the next ABI -// bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +// Spatial SVC is still experimental and may be removed. +#if defined(VPX_TEST_SPATIAL_SVC) VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/ VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/ #endif @@ -192,9 +191,8 @@ typedef struct vpx_codec_cx_pkt { double psnr[4]; /**< PSNR, total/y/u/v */ } psnr; /**< data for PSNR packet */ vpx_fixed_buf_t raw; /**< data for arbitrary packets */ -// Spatial SVC is still experimental and may be removed before the next -// ABI bump. -#if VPX_ENCODER_ABI_VERSION > (5 + VPX_CODEC_ABI_VERSION) +// Spatial SVC is still experimental and may be removed. +#if defined(VPX_TEST_SPATIAL_SVC) size_t layer_sizes[VPX_SS_MAX_LAYERS]; struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS]; #endif @@ -508,25 +506,31 @@ typedef struct vpx_codec_enc_cfg { /*!\brief Rate control adaptation undershoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be subtracted from the target bitrate in order to compensate * for prior overshoot. - * - * Valid values in the range 0-1000. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * undershoot level (current rate vs target) beyond which more aggressive + * corrective measures are taken. + * * + * Valid values in the range VP8:0-1000 VP9: 0-100.
*/ unsigned int rc_undershoot_pct; /*!\brief Rate control adaptation overshoot control * - * This value, expressed as a percentage of the target bitrate, + * VP8: Expressed as a percentage of the target bitrate, * controls the maximum allowed adaptation speed of the codec. * This factor controls the maximum amount of bits that can * be added to the target bitrate in order to compensate for * prior undershoot. + * VP9: Expressed as a percentage of the target bitrate, a threshold + * overshoot level (current rate vs target) beyond which more aggressive + * corrective measures are taken. * - * Valid values in the range 0-1000. + * Valid values in the range VP8:0-1000 VP9: 0-100. */ unsigned int rc_overshoot_pct; @@ -591,6 +595,13 @@ typedef struct vpx_codec_enc_cfg { */ unsigned int rc_2pass_vbr_maxsection_pct; + /*!\brief Two-pass corpus vbr mode complexity control + * Used only in VP9: A value representing the corpus midpoint complexity + * for corpus vbr mode. This value defaults to 0 which disables corpus vbr + * mode in favour of normal vbr mode. + */ + unsigned int rc_2pass_vbr_corpus_complexity; + /* * keyframing settings (kf) */ diff --git a/libvpx/vpx_dsp/add_noise.c b/libvpx/vpx_dsp/add_noise.c index a2b4c9010..cda6ae881 100644 --- a/libvpx/vpx_dsp/add_noise.c +++ b/libvpx/vpx_dsp/add_noise.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/postproc.h" #include "vpx_ports/mem.h" void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, diff --git a/libvpx/vpx_dsp/arm/avg_neon.c b/libvpx/vpx_dsp/arm/avg_neon.c index 257e8ffee..fa7dd0960 100644 --- a/libvpx/vpx_dsp/arm/avg_neon.c +++ b/libvpx/vpx_dsp/arm/avg_neon.c @@ -17,51 +17,35 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { - const uint32x4_t a = vpaddlq_u16(v_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); +uint32_t vpx_avg_4x4_neon(const uint8_t *a, int a_stride) { + const uint8x16_t b = load_unaligned_u8q(a, a_stride); + const uint16x8_t c = vaddl_u8(vget_low_u8(b), vget_high_u8(b)); + const uint32x2_t d = horizontal_add_uint16x8(c); + return vget_lane_u32(vrshr_n_u32(d, 4), 0); } -unsigned int vpx_avg_4x4_neon(const uint8_t *s, int p) { - uint16x8_t v_sum; - uint32x2_t v_s0 = vdup_n_u32(0); - uint32x2_t v_s1 = vdup_n_u32(0); - v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0); - v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0); - v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1); - v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1)); - return (horizontal_add_u16x8(v_sum) + 8) >> 4; -} - -unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) { - uint8x8_t v_s0 = vld1_u8(s); - const uint8x8_t v_s1 = vld1_u8(s + p); - uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); - - v_s0 = vld1_u8(s + 2 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 3 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 4 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 5 * p); - v_sum = vaddw_u8(v_sum, v_s0); - - v_s0 = vld1_u8(s + 6 * p); - v_sum = vaddw_u8(v_sum, v_s0); +uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { + int i; + uint8x8_t b, c; +
uint16x8_t sum; + uint32x2_t d; + b = vld1_u8(a); + a += a_stride; + c = vld1_u8(a); + a += a_stride; + sum = vaddl_u8(b, c); + + for (i = 0; i < 6; ++i) { + const uint8x8_t d = vld1_u8(a); + a += a_stride; + sum = vaddw_u8(sum, d); + } - v_s0 = vld1_u8(s + 7 * p); - v_sum = vaddw_u8(v_sum, v_s0); + d = horizontal_add_uint16x8(sum); - return (horizontal_add_u16x8(v_sum) + 32) >> 6; + return vget_lane_u32(vrshr_n_u32(d, 6), 0); } // coeff: 16 bits, dynamic range [-32640, 32640]. @@ -155,7 +139,8 @@ int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { ref += 16; } - return horizontal_add_u16x8(vec_sum); + return vget_lane_s16(vreinterpret_s16_u32(horizontal_add_uint16x8(vec_sum)), + 0); } // ref, src = [0, 510] - max diff = 16-bits @@ -185,7 +170,7 @@ int vpx_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { { // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired + // horizontal_add_uint16x8(), but one less vpaddl with 'total' when paired // with the summation of 'sse' performed better on a Cortex-A15. const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); diff --git a/libvpx/vpx_dsp/arm/avg_pred_neon.c b/libvpx/vpx_dsp/arm/avg_pred_neon.c new file mode 100644 index 000000000..1370ec2d2 --- /dev/null +++ b/libvpx/vpx_dsp/arm/avg_pred_neon.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + if (width > 8) { + int x, y; + for (y = 0; y < height; ++y) { + for (x = 0; x < width; x += 16) { + const uint8x16_t p = vld1q_u8(pred + x); + const uint8x16_t r = vld1q_u8(ref + x); + const uint8x16_t avg = vrhaddq_u8(p, r); + vst1q_u8(comp + x, avg); + } + comp += width; + pred += width; + ref += ref_stride; + } + } else { + int i; + for (i = 0; i < width * height; i += 16) { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + + if (width == 4) { + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; + } else { + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + assert(width == 8); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + } + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + } + } +} diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libvpx/vpx_dsp/arm/fdct16x16_neon.c new file mode 100644 index 000000000..6b2bebd09 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.c @@ -0,0 +1,387 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Some builds of gcc 4.9.2 and .3 have trouble with some of the inline +// functions. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct16x16_c(input, output, stride); +} + +#else + +static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) { + b[0] = vld1q_s16(a); + a += stride; + b[1] = vld1q_s16(a); + a += stride; + b[2] = vld1q_s16(a); + a += stride; + b[3] = vld1q_s16(a); + a += stride; + b[4] = vld1q_s16(a); + a += stride; + b[5] = vld1q_s16(a); + a += stride; + b[6] = vld1q_s16(a); + a += stride; + b[7] = vld1q_s16(a); + a += stride; + b[8] = vld1q_s16(a); + a += stride; + b[9] = vld1q_s16(a); + a += stride; + b[10] = vld1q_s16(a); + a += stride; + b[11] = vld1q_s16(a); + a += stride; + b[12] = vld1q_s16(a); + a += stride; + b[13] = vld1q_s16(a); + a += stride; + b[14] = vld1q_s16(a); + a += stride; + b[15] = vld1q_s16(a); +} + +// Store 8 16x8 values, assuming stride == 16. +static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) { + store_s16q_to_tran_low(a, b[0]); + a += 16; + store_s16q_to_tran_low(a, b[1]); + a += 16; + store_s16q_to_tran_low(a, b[2]); + a += 16; + store_s16q_to_tran_low(a, b[3]); + a += 16; + store_s16q_to_tran_low(a, b[4]); + a += 16; + store_s16q_to_tran_low(a, b[5]); + a += 16; + store_s16q_to_tran_low(a, b[6]); + a += 16; + store_s16q_to_tran_low(a, b[7]); +} + +// Load step of each pass. Add and subtract clear across the input, requiring +// all 16 values to be loaded. For the first pass it also multiplies by 4. + +// To maybe reduce register usage this could be combined with the load() step to +// get the first 4 and last 4 values, cross those, then load the middle 8 values +// and cross them. 
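/* For reference, a scalar sketch of the pass-0 cross step (illustrative
 * only; plain int16_t values stand in for one lane of each int16x8_t, and
 * the pass-1 variant below performs the same fold without the << 2): */
static void cross_input_pass0_scalar(const int16_t a[16], int16_t b[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    b[i] = (int16_t)((a[i] + a[15 - i]) << 2);        /* folded sums */
    b[8 + i] = (int16_t)((a[7 - i] - a[8 + i]) << 2); /* folded differences */
  }
}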
+static INLINE void cross_input(const int16x8_t *a /*[16]*/, + int16x8_t *b /*[16]*/, const int pass) { + if (pass == 0) { + b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2); + b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2); + b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2); + b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2); + b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2); + b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2); + b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2); + b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2); + + b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2); + b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2); + b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2); + b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2); + b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2); + b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2); + b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2); + b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2); + } else { + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + } +} + +// Quarter round at the beginning of the second pass. Can't use vrshr (rounding) +// because this only adds 1, not 1 << 2. +static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { + const int16x8_t one = vdupq_n_s16(1); + a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2); + a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2); + a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2); + a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2); + a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2); + a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2); + a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2); + a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2); + a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2); + a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2); + a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2); + a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2); + a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2); + a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2); + a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2); + a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); +} + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t c, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t c0, + const tran_coef_t c1, int16x8_t *add, + int16x8_t *sub) { + const int32x4_t a0 = 
vmull_n_s16(vget_low_s16(a), c0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/, + int16x8_t *b /*[8]*/) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +// Main body of fdct16x16. 
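/* The two butterfly helpers above are the NEON form of fdct_round_shift()
 * from the C transform. A scalar sketch of their arithmetic (assuming
 * DCT_CONST_BITS == 14 and ignoring the saturation applied by
 * vqrshrn_n_s32):
 *   butterfly_one_coeff: add = fdct_round_shift((a + b) * c)
 *                        sub = fdct_round_shift((a - b) * c)
 *   butterfly_two_coeff: add = fdct_round_shift(a * c1 + b * c0)
 *                        sub = fdct_round_shift(a * c0 - b * c1) */
static void butterfly_two_coeff_scalar(int32_t a, int32_t b, int32_t c0,
                                       int32_t c1, int32_t *add,
                                       int32_t *sub) {
  *add = (a * c1 + b * c0 + (1 << 13)) >> 14;
  *sub = (a * c0 - b * c1 + (1 << 13)) >> 14;
}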
+static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64) + // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64) + // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64) + // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64) + // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64) + // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64) + // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] * + // cospi_22_64) + // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64) + // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64) + butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9], + &out[7]); + butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1], + &out[15]); + butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13], + &out[3]); + butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5], + &out[11]); +} + +void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[16]; + int16x8_t temp1[16]; + int16x8_t temp2[16]; + int16x8_t temp3[16]; + + // Left half. + load(input, stride, temp0); + cross_input(temp0, temp1, 0); + dct_body(temp1, temp0); + + // Right half. + load(input + 8, stride, temp1); + cross_input(temp1, temp2, 0); + dct_body(temp2, temp1); + + // Transpose top left and top right quarters into one contiguous location to + // process to the top half. 
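/* Layout sketch for the steps below: after the first pass, temp0 and temp1
 * hold the transformed left and right 8-column halves, 16 rows each. The
 * next two transposes gather the top 8 rows of both halves into
 * temp2[0..15], one contiguous 16x8 block, so the second (column) pass can
 * run on it directly. */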
+ transpose_8x8(&temp0[0], &temp2[0]); + transpose_8x8(&temp1[0], &temp2[8]); + partial_round_shift(temp2); + cross_input(temp2, temp3, 1); + dct_body(temp3, temp2); + transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4], + &temp2[5], &temp2[6], &temp2[7]); + transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12], + &temp2[13], &temp2[14], &temp2[15]); + store(output, temp2); + store(output + 8, temp2 + 8); + output += 8 * 16; + + // Transpose bottom left and bottom right quarters into one contiguous + // location to process to the bottom half. + transpose_8x8(&temp0[8], &temp1[0]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + partial_round_shift(temp1); + cross_input(temp1, temp0, 1); + dct_body(temp0, temp1); + transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4], + &temp1[5], &temp1[6], &temp1[7]); + transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], + &temp1[13], &temp1[14], &temp1[15]); + store(output, temp1); + store(output + 8, temp1 + 8); +} +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4 diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libvpx/vpx_dsp/arm/fdct32x32_neon.c new file mode 100644 index 000000000..e9cd34904 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.c @@ -0,0 +1,1507 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +// Most gcc 4.9 distributions outside of Android do not generate correct code +// for this function. +#if !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && \ + __GNUC__ == 4 && __GNUC_MINOR__ <= 9 + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + vpx_fdct32x32_c(input, output, stride); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + vpx_fdct32x32_rd_c(input, output, stride); +} + +#else + +#define LOAD_INCREMENT(src, stride, dest, index) \ + do { \ + dest[index] = vld1q_s16(src); \ + src += stride; \ + } while (0) + +#define ADD_S16(src, index0, index1, dest, index3) \ + do { \ + dest[index3] = vaddq_s16(src[index0], src[index1]); \ + } while (0) + +#define ADD_SHIFT_S16(src, index0, index1) \ + do { \ + src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \ + } while (0) + +// Load, cross, and multiply by 4. Load the first 8 and last 8, then the +// middle +// 16. Doing sets of 16 at a time. Maybe sets of 8 would be better? 
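/* A scalar sketch of the load/cross/scale step built from the macros above
 * (illustrative only: a[i * stride] stands for one lane of input row i of
 * the current 8-column slice): */
static void load_cross_scale_scalar(const int16_t *a, int stride,
                                    int16_t b[32]) {
  int i;
  for (i = 0; i < 16; ++i) {
    const int16_t top = a[i * stride];
    const int16_t bottom = a[(31 - i) * stride];
    b[i] = (int16_t)((top + bottom) << 2);      /* folded sums */
    b[31 - i] = (int16_t)((top - bottom) << 2); /* folded differences */
  }
}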
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b) { + const int16_t *a_end = a + 24 * stride; + int16x8_t c[8]; + + LOAD_INCREMENT(a, stride, b, 0); + LOAD_INCREMENT(a, stride, b, 1); + LOAD_INCREMENT(a, stride, b, 2); + LOAD_INCREMENT(a, stride, b, 3); + LOAD_INCREMENT(a, stride, b, 4); + LOAD_INCREMENT(a, stride, b, 5); + LOAD_INCREMENT(a, stride, b, 6); + LOAD_INCREMENT(a, stride, b, 7); + + LOAD_INCREMENT(a_end, stride, b, 24); + LOAD_INCREMENT(a_end, stride, b, 25); + LOAD_INCREMENT(a_end, stride, b, 26); + LOAD_INCREMENT(a_end, stride, b, 27); + LOAD_INCREMENT(a_end, stride, b, 28); + LOAD_INCREMENT(a_end, stride, b, 29); + LOAD_INCREMENT(a_end, stride, b, 30); + LOAD_INCREMENT(a_end, stride, b, 31); + + ADD_S16(b, 0, 31, c, 0); + ADD_S16(b, 1, 30, c, 1); + ADD_S16(b, 2, 29, c, 2); + ADD_S16(b, 3, 28, c, 3); + ADD_S16(b, 4, 27, c, 4); + ADD_S16(b, 5, 26, c, 5); + ADD_S16(b, 6, 25, c, 6); + ADD_S16(b, 7, 24, c, 7); + + ADD_SHIFT_S16(b, 7, 24); + ADD_SHIFT_S16(b, 6, 25); + ADD_SHIFT_S16(b, 5, 26); + ADD_SHIFT_S16(b, 4, 27); + ADD_SHIFT_S16(b, 3, 28); + ADD_SHIFT_S16(b, 2, 29); + ADD_SHIFT_S16(b, 1, 30); + ADD_SHIFT_S16(b, 0, 31); + + b[0] = vshlq_n_s16(c[0], 2); + b[1] = vshlq_n_s16(c[1], 2); + b[2] = vshlq_n_s16(c[2], 2); + b[3] = vshlq_n_s16(c[3], 2); + b[4] = vshlq_n_s16(c[4], 2); + b[5] = vshlq_n_s16(c[5], 2); + b[6] = vshlq_n_s16(c[6], 2); + b[7] = vshlq_n_s16(c[7], 2); + + LOAD_INCREMENT(a, stride, b, 8); + LOAD_INCREMENT(a, stride, b, 9); + LOAD_INCREMENT(a, stride, b, 10); + LOAD_INCREMENT(a, stride, b, 11); + LOAD_INCREMENT(a, stride, b, 12); + LOAD_INCREMENT(a, stride, b, 13); + LOAD_INCREMENT(a, stride, b, 14); + LOAD_INCREMENT(a, stride, b, 15); + LOAD_INCREMENT(a, stride, b, 16); + LOAD_INCREMENT(a, stride, b, 17); + LOAD_INCREMENT(a, stride, b, 18); + LOAD_INCREMENT(a, stride, b, 19); + LOAD_INCREMENT(a, stride, b, 20); + LOAD_INCREMENT(a, stride, b, 21); + LOAD_INCREMENT(a, stride, b, 22); + LOAD_INCREMENT(a, stride, b, 23); + + ADD_S16(b, 8, 23, c, 0); + ADD_S16(b, 9, 22, c, 1); + ADD_S16(b, 10, 21, c, 2); + ADD_S16(b, 11, 20, c, 3); + ADD_S16(b, 12, 19, c, 4); + ADD_S16(b, 13, 18, c, 5); + ADD_S16(b, 14, 17, c, 6); + ADD_S16(b, 15, 16, c, 7); + + ADD_SHIFT_S16(b, 15, 16); + ADD_SHIFT_S16(b, 14, 17); + ADD_SHIFT_S16(b, 13, 18); + ADD_SHIFT_S16(b, 12, 19); + ADD_SHIFT_S16(b, 11, 20); + ADD_SHIFT_S16(b, 10, 21); + ADD_SHIFT_S16(b, 9, 22); + ADD_SHIFT_S16(b, 8, 23); + + b[8] = vshlq_n_s16(c[0], 2); + b[9] = vshlq_n_s16(c[1], 2); + b[10] = vshlq_n_s16(c[2], 2); + b[11] = vshlq_n_s16(c[3], 2); + b[12] = vshlq_n_s16(c[4], 2); + b[13] = vshlq_n_s16(c[5], 2); + b[14] = vshlq_n_s16(c[6], 2); + b[15] = vshlq_n_s16(c[7], 2); +} + +#undef LOAD_INCREMENT +#undef ADD_S16 +#undef ADD_SHIFT_S16 + +#define STORE_S16(src, index, dest) \ + do { \ + store_s16q_to_tran_low(dest, src[index]); \ + dest += 8; \ + } while (0); + +// Store 32 16x8 values, assuming stride == 32. +// Slight twist: store horizontally in blocks of 8. 
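/* The resulting order, modeled with scalars (b[v][k] standing for lane k of
 * vector v, and tran_low_t as used in this file): output row r of the
 * 32-wide block is assembled from vectors r, r + 8, r + 16 and r + 24. */
static void store_scalar(tran_low_t *a, const int16_t b[32][8]) {
  int r, j, k;
  for (r = 0; r < 8; ++r)
    for (j = 0; j < 4; ++j)
      for (k = 0; k < 8; ++k)
        a[r * 32 + j * 8 + k] = (tran_low_t)b[r + j * 8][k];
}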
+static INLINE void store(tran_low_t *a, const int16x8_t *b) { + STORE_S16(b, 0, a); + STORE_S16(b, 8, a); + STORE_S16(b, 16, a); + STORE_S16(b, 24, a); + STORE_S16(b, 1, a); + STORE_S16(b, 9, a); + STORE_S16(b, 17, a); + STORE_S16(b, 25, a); + STORE_S16(b, 2, a); + STORE_S16(b, 10, a); + STORE_S16(b, 18, a); + STORE_S16(b, 26, a); + STORE_S16(b, 3, a); + STORE_S16(b, 11, a); + STORE_S16(b, 19, a); + STORE_S16(b, 27, a); + STORE_S16(b, 4, a); + STORE_S16(b, 12, a); + STORE_S16(b, 20, a); + STORE_S16(b, 28, a); + STORE_S16(b, 5, a); + STORE_S16(b, 13, a); + STORE_S16(b, 21, a); + STORE_S16(b, 29, a); + STORE_S16(b, 6, a); + STORE_S16(b, 14, a); + STORE_S16(b, 22, a); + STORE_S16(b, 30, a); + STORE_S16(b, 7, a); + STORE_S16(b, 15, a); + STORE_S16(b, 23, a); + STORE_S16(b, 31, a); +} + +#undef STORE_S16 + +// fdct_round_shift((a +/- b) * c) +static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b, + const tran_high_t constant, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// fdct_round_shift(a * c0 +/- b * c1) +static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b, + const tran_coef_t constant0, + const tran_coef_t constant1, + int16x8_t *add, int16x8_t *sub) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0); + const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1); + const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1); + const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0); + const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1); + const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS); + const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS); + const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS); + const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS); + *add = vcombine_s16(rounded0, rounded1); + *sub = vcombine_s16(rounded2, rounded3); +} + +// Add 2 if positive, 1 if negative, and shift by 2. +// In practice, subtract the sign bit, then shift with rounding. +static INLINE int16x8_t sub_round_shift(const int16x8_t a) { + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2); +} + +static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1: Done as part of the load. + + // Stage 2. + // Mini cross. X the first 16 values and the middle 8 of the second half. 
+ a[0] = vaddq_s16(in[0], in[15]); + a[1] = vaddq_s16(in[1], in[14]); + a[2] = vaddq_s16(in[2], in[13]); + a[3] = vaddq_s16(in[3], in[12]); + a[4] = vaddq_s16(in[4], in[11]); + a[5] = vaddq_s16(in[5], in[10]); + a[6] = vaddq_s16(in[6], in[9]); + a[7] = vaddq_s16(in[7], in[8]); + + a[8] = vsubq_s16(in[7], in[8]); + a[9] = vsubq_s16(in[6], in[9]); + a[10] = vsubq_s16(in[5], in[10]); + a[11] = vsubq_s16(in[4], in[11]); + a[12] = vsubq_s16(in[3], in[12]); + a[13] = vsubq_s16(in[2], in[13]); + a[14] = vsubq_s16(in[1], in[14]); + a[15] = vsubq_s16(in[0], in[15]); + + a[16] = in[16]; + a[17] = in[17]; + a[18] = in[18]; + a[19] = in[19]; + + butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]); + butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]); + butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]); + butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]); + + a[28] = in[28]; + a[29] = in[29]; + a[30] = in[30]; + a[31] = in[31]; + + // Stage 3. + b[0] = vaddq_s16(a[0], a[7]); + b[1] = vaddq_s16(a[1], a[6]); + b[2] = vaddq_s16(a[2], a[5]); + b[3] = vaddq_s16(a[3], a[4]); + + b[4] = vsubq_s16(a[3], a[4]); + b[5] = vsubq_s16(a[2], a[5]); + b[6] = vsubq_s16(a[1], a[6]); + b[7] = vsubq_s16(a[0], a[7]); + + b[8] = a[8]; + b[9] = a[9]; + + butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]); + butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]); + + b[14] = a[14]; + b[15] = a[15]; + + b[16] = vaddq_s16(in[16], a[23]); + b[17] = vaddq_s16(in[17], a[22]); + b[18] = vaddq_s16(in[18], a[21]); + b[19] = vaddq_s16(in[19], a[20]); + + b[20] = vsubq_s16(in[19], a[20]); + b[21] = vsubq_s16(in[18], a[21]); + b[22] = vsubq_s16(in[17], a[22]); + b[23] = vsubq_s16(in[16], a[23]); + + b[24] = vsubq_s16(in[31], a[24]); + b[25] = vsubq_s16(in[30], a[25]); + b[26] = vsubq_s16(in[29], a[26]); + b[27] = vsubq_s16(in[28], a[27]); + + b[28] = vaddq_s16(in[28], a[27]); + b[29] = vaddq_s16(in[29], a[26]); + b[30] = vaddq_s16(in[30], a[25]); + b[31] = vaddq_s16(in[31], a[24]); + + // Stage 4. + a[0] = vaddq_s16(b[0], b[3]); + a[1] = vaddq_s16(b[1], b[2]); + a[2] = vsubq_s16(b[1], b[2]); + a[3] = vsubq_s16(b[0], b[3]); + + a[4] = b[4]; + + butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]); + + a[7] = b[7]; + + a[8] = vaddq_s16(b[8], b[11]); + a[9] = vaddq_s16(b[9], b[10]); + a[10] = vsubq_s16(b[9], b[10]); + a[11] = vsubq_s16(b[8], b[11]); + a[12] = vsubq_s16(b[15], b[12]); + a[13] = vsubq_s16(b[14], b[13]); + a[14] = vaddq_s16(b[14], b[13]); + a[15] = vaddq_s16(b[15], b[12]); + + a[16] = b[16]; + a[17] = b[17]; + + butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]); + butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]); + butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]); + butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]); + + a[22] = b[22]; + a[23] = b[23]; + a[24] = b[24]; + a[25] = b[25]; + + a[30] = b[30]; + a[31] = b[31]; + + // Stage 5. 
+ butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]); + butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]); + + b[4] = vaddq_s16(a[4], a[5]); + b[5] = vsubq_s16(a[4], a[5]); + b[6] = vsubq_s16(a[7], a[6]); + b[7] = vaddq_s16(a[7], a[6]); + + b[8] = a[8]; + + butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]); + butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]); + + b[11] = a[11]; + b[12] = a[12]; + + b[15] = a[15]; + + b[16] = vaddq_s16(a[19], a[16]); + b[17] = vaddq_s16(a[18], a[17]); + b[18] = vsubq_s16(a[17], a[18]); + b[19] = vsubq_s16(a[16], a[19]); + b[20] = vsubq_s16(a[23], a[20]); + b[21] = vsubq_s16(a[22], a[21]); + b[22] = vaddq_s16(a[21], a[22]); + b[23] = vaddq_s16(a[20], a[23]); + b[24] = vaddq_s16(a[27], a[24]); + b[25] = vaddq_s16(a[26], a[25]); + b[26] = vsubq_s16(a[25], a[26]); + b[27] = vsubq_s16(a[24], a[27]); + b[28] = vsubq_s16(a[31], a[28]); + b[29] = vsubq_s16(a[30], a[29]); + b[30] = vaddq_s16(a[29], a[30]); + b[31] = vaddq_s16(a[28], a[31]); + + // Stage 6. + a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + + butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]); + butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]); + + a[8] = vaddq_s16(b[8], b[9]); + a[9] = vsubq_s16(b[8], b[9]); + a[10] = vsubq_s16(b[11], b[10]); + a[11] = vaddq_s16(b[11], b[10]); + a[12] = vaddq_s16(b[12], b[13]); + a[13] = vsubq_s16(b[12], b[13]); + a[14] = vsubq_s16(b[15], b[14]); + a[15] = vaddq_s16(b[15], b[14]); + + a[16] = b[16]; + a[19] = b[19]; + a[20] = b[20]; + a[23] = b[23]; + a[24] = b[24]; + a[27] = b[27]; + a[28] = b[28]; + a[31] = b[31]; + + butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]); + butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]); + + butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]); + butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]); + + // Stage 7. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + b[4] = a[4]; + b[5] = a[5]; + b[6] = a[6]; + b[7] = a[7]; + + butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]); + butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]); + butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]); + butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]); + + b[16] = vaddq_s16(a[16], a[17]); + b[17] = vsubq_s16(a[16], a[17]); + b[18] = vsubq_s16(a[19], a[18]); + b[19] = vaddq_s16(a[19], a[18]); + b[20] = vaddq_s16(a[20], a[21]); + b[21] = vsubq_s16(a[20], a[21]); + b[22] = vsubq_s16(a[23], a[22]); + b[23] = vaddq_s16(a[23], a[22]); + b[24] = vaddq_s16(a[24], a[25]); + b[25] = vsubq_s16(a[24], a[25]); + b[26] = vsubq_s16(a[27], a[26]); + b[27] = vaddq_s16(a[27], a[26]); + b[28] = vaddq_s16(a[28], a[29]); + b[29] = vsubq_s16(a[28], a[29]); + b[30] = vsubq_s16(a[31], a[30]); + b[31] = vaddq_s16(a[31], a[30]); + + // Final stage. 
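/* Ordering note for the stores below: each of b[0..15] lands at the 5-bit
 * bit-reversal of its index (b[1] -> out[16], b[2] -> out[8],
 * b[3] -> out[24], ...), the usual decimation-in-frequency layout, while
 * the b[16..31] pairs are rotated by the odd cospi constants to produce
 * the odd-indexed outputs. */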
+ // Also compute partial rounding shift: + // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; + out[0] = sub_round_shift(b[0]); + out[16] = sub_round_shift(b[1]); + out[8] = sub_round_shift(b[2]); + out[24] = sub_round_shift(b[3]); + out[4] = sub_round_shift(b[4]); + out[20] = sub_round_shift(b[5]); + out[12] = sub_round_shift(b[6]); + out[28] = sub_round_shift(b[7]); + out[2] = sub_round_shift(b[8]); + out[18] = sub_round_shift(b[9]); + out[10] = sub_round_shift(b[10]); + out[26] = sub_round_shift(b[11]); + out[6] = sub_round_shift(b[12]); + out[22] = sub_round_shift(b[13]); + out[14] = sub_round_shift(b[14]); + out[30] = sub_round_shift(b[15]); + + butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]); + out[1] = sub_round_shift(a[1]); + out[31] = sub_round_shift(a[31]); + + butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]); + out[17] = sub_round_shift(a[17]); + out[15] = sub_round_shift(a[15]); + + butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]); + out[9] = sub_round_shift(a[9]); + out[23] = sub_round_shift(a[23]); + + butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]); + out[25] = sub_round_shift(a[25]); + out[7] = sub_round_shift(a[7]); + + butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]); + out[5] = sub_round_shift(a[5]); + out[27] = sub_round_shift(a[27]); + + butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]); + out[21] = sub_round_shift(a[21]); + out[11] = sub_round_shift(a[11]); + + butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]); + out[13] = sub_round_shift(a[13]); + out[19] = sub_round_shift(a[19]); + + butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]); + out[29] = sub_round_shift(a[29]); + out[3] = sub_round_shift(a[3]); +} + +#define PASS_THROUGH(src, dst, element) \ + do { \ + dst##_lo[element] = src##_lo[element]; \ + dst##_hi[element] = src##_hi[element]; \ + } while (0) + +#define ADD_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define SUB_S16_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = \ + vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \ + b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \ + vget_high_s16(a[right_index])); \ + } while (0) + +#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \ + do { \ + c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \ + c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \ + } while (0) + +#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \ + do { \ + temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \ + temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \ + c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \ + c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \ + } while (0) + +#define ADD_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +#define SUB_S32(a, left_index, right_index, b, b_index) \ + do { \ + b##_lo[b_index] = 
vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \ + b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \ + } while (0) + +// Like butterfly_one_coeff, but don't narrow results. +static INLINE void butterfly_one_coeff_s16_s32( + const int16x8_t a, const int16x8_t b, const tran_high_t constant, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant); + const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant); + const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant); + const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant); + const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \ + add_index, sub_index) \ + do { \ + butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \ + &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +// Like butterfly_one_coeff, but with s32. +static INLINE void butterfly_one_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant); + const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant); + const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant); + const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant); + const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \ + sub_index) \ + do { \ + butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + constant, &b##_lo[add_index], &b##_hi[add_index], \ + &b##_lo[sub_index], &b##_hi[sub_index]); \ + } while (0) + +// Like butterfly_two_coeff, but with s32. 
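+// Per lane this computes, with the same rounding:
+//   add = ROUND_POWER_OF_TWO(a * constant1 + b * constant0, DCT_CONST_BITS);
+//   sub = ROUND_POWER_OF_TWO(a * constant0 - b * constant1, DCT_CONST_BITS);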
+static INLINE void butterfly_two_coeff_s32( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const int32_t constant0, const int32_t constant1, + int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo, + int32x4_t *sub_hi) { + const int32x4_t a0 = vmulq_n_s32(a_lo, constant0); + const int32x4_t a1 = vmulq_n_s32(a_hi, constant0); + const int32x4_t a2 = vmulq_n_s32(a_lo, constant1); + const int32x4_t a3 = vmulq_n_s32(a_hi, constant1); + const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0); + const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0); + const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1); + const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1); + *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS); + *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS); + *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS); + *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS); +} + +#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \ + right_constant, b, add_index, sub_index) \ + do { \ + butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \ + a##_lo[right_index], a##_hi[right_index], \ + left_constant, right_constant, &b##_lo[add_index], \ + &b##_hi[add_index], &b##_lo[sub_index], \ + &b##_hi[sub_index]); \ + } while (0) + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. +static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo, + const int32x4_t a_hi) { + const int32x4_t one = vdupq_n_s32(1); + const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo); + const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31); + const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32); + const int16x4_t b_lo = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2); + const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi); + const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31); + const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32); + const int16x4_t b_hi = + vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2); + return vcombine_s16(b_lo, b_hi); +} + +static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + int32x4_t c_lo[32]; + int32x4_t c_hi[32]; + int32x4_t d_lo[32]; + int32x4_t d_hi[32]; + + // Stage 1. Done as part of the load for the first pass. 
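+  // (Here the cross add/sub is done explicitly: a[i] = in[i] + in[31 - i]
+  // for i < 16, and a[i] = in[31 - i] - in[i] for i >= 16.)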
+ a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + b[0] = vaddq_s16(a[0], a[15]); + b[1] = vaddq_s16(a[1], a[14]); + b[2] = vaddq_s16(a[2], a[13]); + b[3] = vaddq_s16(a[3], a[12]); + b[4] = vaddq_s16(a[4], a[11]); + b[5] = vaddq_s16(a[5], a[10]); + b[6] = vaddq_s16(a[6], a[9]); + b[7] = vaddq_s16(a[7], a[8]); + + b[8] = vsubq_s16(a[7], a[8]); + b[9] = vsubq_s16(a[6], a[9]); + b[10] = vsubq_s16(a[5], a[10]); + b[11] = vsubq_s16(a[4], a[11]); + b[12] = vsubq_s16(a[3], a[12]); + b[13] = vsubq_s16(a[2], a[13]); + b[14] = vsubq_s16(a[1], a[14]); + b[15] = vsubq_s16(a[0], a[15]); + + b[16] = a[16]; + b[17] = a[17]; + b[18] = a[18]; + b[19] = a[19]; + + butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); + + b[28] = a[28]; + b[29] = a[29]; + b[30] = a[30]; + b[31] = a[31]; + + // Stage 3. With extreme values for input this calculation rolls over int16_t. + // The sources for b[0] get added multiple times and, through testing, have + // been shown to overflow starting here. + ADD_S16_S32(b, 0, 7, c, 0); + ADD_S16_S32(b, 1, 6, c, 1); + ADD_S16_S32(b, 2, 5, c, 2); + ADD_S16_S32(b, 3, 4, c, 3); + SUB_S16_S32(b, 3, 4, c, 4); + SUB_S16_S32(b, 2, 5, c, 5); + SUB_S16_S32(b, 1, 6, c, 6); + SUB_S16_S32(b, 0, 7, c, 7); + + a[8] = b[8]; + a[9] = b[9]; + + BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10); + BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11); + + a[14] = b[14]; + a[15] = b[15]; + + ADD_S16_S32(b, 16, 23, c, 16); + ADD_S16_S32(b, 17, 22, c, 17); + ADD_S16_S32(b, 18, 21, c, 18); + ADD_S16_S32(b, 19, 20, c, 19); + SUB_S16_S32(b, 19, 20, c, 20); + SUB_S16_S32(b, 18, 21, c, 21); + SUB_S16_S32(b, 17, 22, c, 22); + SUB_S16_S32(b, 16, 23, c, 23); + SUB_S16_S32(b, 31, 24, c, 24); + SUB_S16_S32(b, 30, 25, c, 25); + SUB_S16_S32(b, 29, 26, c, 26); + SUB_S16_S32(b, 28, 27, c, 27); + ADD_S16_S32(b, 28, 27, c, 28); + ADD_S16_S32(b, 29, 26, c, 29); + ADD_S16_S32(b, 30, 25, c, 30); + ADD_S16_S32(b, 31, 24, c, 31); + + // Stage 4. 
+ ADD_S32(c, 0, 3, d, 0); + ADD_S32(c, 1, 2, d, 1); + SUB_S32(c, 1, 2, d, 2); + SUB_S32(c, 0, 3, d, 3); + + PASS_THROUGH(c, d, 4); + + BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5); + + PASS_THROUGH(c, d, 7); + + ADDW_S16_S32(c, 11, a, 8, d, 8); + ADDW_S16_S32(c, 10, a, 9, d, 9); + SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10); + SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11); + SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12); + SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13); + ADDW_S16_S32(c, 13, b, 14, d, 14); + ADDW_S16_S32(c, 12, b, 15, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 17); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19); + BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20); + BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21); + + PASS_THROUGH(c, d, 22); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 25); + + PASS_THROUGH(c, d, 30); + PASS_THROUGH(c, d, 31); + + // Stage 5. + BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1); + BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3); + + ADD_S32(d, 4, 5, c, 4); + SUB_S32(d, 4, 5, c, 5); + SUB_S32(d, 7, 6, c, 6); + ADD_S32(d, 7, 6, c, 7); + + PASS_THROUGH(d, c, 8); + + BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9); + BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10); + + PASS_THROUGH(d, c, 11); + PASS_THROUGH(d, c, 12); + PASS_THROUGH(d, c, 15); + + ADD_S32(d, 16, 19, c, 16); + ADD_S32(d, 17, 18, c, 17); + SUB_S32(d, 17, 18, c, 18); + SUB_S32(d, 16, 19, c, 19); + SUB_S32(d, 23, 20, c, 20); + SUB_S32(d, 22, 21, c, 21); + ADD_S32(d, 22, 21, c, 22); + ADD_S32(d, 23, 20, c, 23); + ADD_S32(d, 24, 27, c, 24); + ADD_S32(d, 25, 26, c, 25); + SUB_S32(d, 25, 26, c, 26); + SUB_S32(d, 24, 27, c, 27); + SUB_S32(d, 31, 28, c, 28); + SUB_S32(d, 30, 29, c, 29); + ADD_S32(d, 30, 29, c, 30); + ADD_S32(d, 31, 28, c, 31); + + // Stage 6. + PASS_THROUGH(c, d, 0); + PASS_THROUGH(c, d, 1); + PASS_THROUGH(c, d, 2); + PASS_THROUGH(c, d, 3); + + BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7); + BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6); + + ADD_S32(c, 8, 9, d, 8); + SUB_S32(c, 8, 9, d, 9); + SUB_S32(c, 11, 10, d, 10); + ADD_S32(c, 11, 10, d, 11); + ADD_S32(c, 12, 13, d, 12); + SUB_S32(c, 12, 13, d, 13); + SUB_S32(c, 15, 14, d, 14); + ADD_S32(c, 15, 14, d, 15); + + PASS_THROUGH(c, d, 16); + PASS_THROUGH(c, d, 19); + PASS_THROUGH(c, d, 20); + PASS_THROUGH(c, d, 23); + PASS_THROUGH(c, d, 24); + PASS_THROUGH(c, d, 27); + PASS_THROUGH(c, d, 28); + PASS_THROUGH(c, d, 31); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17); + BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18); + BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21); + BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22); + + // Stage 7. 
+ PASS_THROUGH(d, c, 0); + PASS_THROUGH(d, c, 1); + PASS_THROUGH(d, c, 2); + PASS_THROUGH(d, c, 3); + PASS_THROUGH(d, c, 4); + PASS_THROUGH(d, c, 5); + PASS_THROUGH(d, c, 6); + PASS_THROUGH(d, c, 7); + + BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15); + BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14); + BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13); + BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12); + + ADD_S32(d, 16, 17, c, 16); + SUB_S32(d, 16, 17, c, 17); + SUB_S32(d, 19, 18, c, 18); + ADD_S32(d, 19, 18, c, 19); + ADD_S32(d, 20, 21, c, 20); + SUB_S32(d, 20, 21, c, 21); + SUB_S32(d, 23, 22, c, 22); + ADD_S32(d, 23, 22, c, 23); + ADD_S32(d, 24, 25, c, 24); + SUB_S32(d, 24, 25, c, 25); + SUB_S32(d, 27, 26, c, 26); + ADD_S32(d, 27, 26, c, 27); + ADD_S32(d, 28, 29, c, 28); + SUB_S32(d, 28, 29, c, 29); + SUB_S32(d, 31, 30, c, 30); + ADD_S32(d, 31, 30, c, 31); + + // Final stage. + // Roll rounding into this function so we can pass back int16x8. + + out[0] = add_round_shift_s32(c_lo[0], c_hi[0]); + out[16] = add_round_shift_s32(c_lo[1], c_hi[1]); + + out[8] = add_round_shift_s32(c_lo[2], c_hi[2]); + out[24] = add_round_shift_s32(c_lo[3], c_hi[3]); + out[4] = add_round_shift_s32(c_lo[4], c_hi[4]); + out[20] = add_round_shift_s32(c_lo[5], c_hi[5]); + out[12] = add_round_shift_s32(c_lo[6], c_hi[6]); + + out[28] = add_round_shift_s32(c_lo[7], c_hi[7]); + out[2] = add_round_shift_s32(c_lo[8], c_hi[8]); + out[18] = add_round_shift_s32(c_lo[9], c_hi[9]); + out[10] = add_round_shift_s32(c_lo[10], c_hi[10]); + + out[26] = add_round_shift_s32(c_lo[11], c_hi[11]); + out[6] = add_round_shift_s32(c_lo[12], c_hi[12]); + out[22] = add_round_shift_s32(c_lo[13], c_hi[13]); + out[14] = add_round_shift_s32(c_lo[14], c_hi[14]); + out[30] = add_round_shift_s32(c_lo[15], c_hi[15]); + + BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31); + out[1] = add_round_shift_s32(d_lo[1], d_hi[1]); + out[31] = add_round_shift_s32(d_lo[31], d_hi[31]); + + BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15); + out[17] = add_round_shift_s32(d_lo[17], d_hi[17]); + out[15] = add_round_shift_s32(d_lo[15], d_hi[15]); + + BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23); + out[9] = add_round_shift_s32(d_lo[9], d_hi[9]); + out[23] = add_round_shift_s32(d_lo[23], d_hi[23]); + + BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7); + out[25] = add_round_shift_s32(d_lo[25], d_hi[25]); + out[7] = add_round_shift_s32(d_lo[7], d_hi[7]); + + BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27); + out[5] = add_round_shift_s32(d_lo[5], d_hi[5]); + out[27] = add_round_shift_s32(d_lo[27], d_hi[27]); + + BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11); + out[21] = add_round_shift_s32(d_lo[21], d_hi[21]); + out[11] = add_round_shift_s32(d_lo[11], d_hi[11]); + + BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19); + out[13] = add_round_shift_s32(d_lo[13], d_hi[13]); + out[19] = add_round_shift_s32(d_lo[19], d_hi[19]); + + BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3); + out[29] = add_round_shift_s32(d_lo[29], d_hi[29]); + out[3] = add_round_shift_s32(d_lo[3], d_hi[3]); +} + +// Add 1 if positive, 2 if negative, and shift by 2. +// In practice, add 1, then add the sign bit, then shift without rounding. 
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) { + const int16x8_t one = vdupq_n_s16(1); + const uint16x8_t a_u16 = vreinterpretq_u16_s16(a); + const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15); + const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16); + return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2); +} + +static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) { + int16x8_t a[32]; + int16x8_t b[32]; + + // Stage 1. Done as part of the load for the first pass. + a[0] = vaddq_s16(in[0], in[31]); + a[1] = vaddq_s16(in[1], in[30]); + a[2] = vaddq_s16(in[2], in[29]); + a[3] = vaddq_s16(in[3], in[28]); + a[4] = vaddq_s16(in[4], in[27]); + a[5] = vaddq_s16(in[5], in[26]); + a[6] = vaddq_s16(in[6], in[25]); + a[7] = vaddq_s16(in[7], in[24]); + a[8] = vaddq_s16(in[8], in[23]); + a[9] = vaddq_s16(in[9], in[22]); + a[10] = vaddq_s16(in[10], in[21]); + a[11] = vaddq_s16(in[11], in[20]); + a[12] = vaddq_s16(in[12], in[19]); + a[13] = vaddq_s16(in[13], in[18]); + a[14] = vaddq_s16(in[14], in[17]); + a[15] = vaddq_s16(in[15], in[16]); + a[16] = vsubq_s16(in[15], in[16]); + a[17] = vsubq_s16(in[14], in[17]); + a[18] = vsubq_s16(in[13], in[18]); + a[19] = vsubq_s16(in[12], in[19]); + a[20] = vsubq_s16(in[11], in[20]); + a[21] = vsubq_s16(in[10], in[21]); + a[22] = vsubq_s16(in[9], in[22]); + a[23] = vsubq_s16(in[8], in[23]); + a[24] = vsubq_s16(in[7], in[24]); + a[25] = vsubq_s16(in[6], in[25]); + a[26] = vsubq_s16(in[5], in[26]); + a[27] = vsubq_s16(in[4], in[27]); + a[28] = vsubq_s16(in[3], in[28]); + a[29] = vsubq_s16(in[2], in[29]); + a[30] = vsubq_s16(in[1], in[30]); + a[31] = vsubq_s16(in[0], in[31]); + + // Stage 2. + // For the "rd" version, all the values are rounded down after stage 2 to keep + // the values in 16 bits. 
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15])); + b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14])); + b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13])); + b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12])); + b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11])); + b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10])); + b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9])); + b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8])); + + b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8])); + b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9])); + b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10])); + b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11])); + b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12])); + b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13])); + b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14])); + b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15])); + + b[16] = add_round_shift_s16(a[16]); + b[17] = add_round_shift_s16(a[17]); + b[18] = add_round_shift_s16(a[18]); + b[19] = add_round_shift_s16(a[19]); + + butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]); + butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]); + butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]); + butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]); + b[20] = add_round_shift_s16(b[20]); + b[21] = add_round_shift_s16(b[21]); + b[22] = add_round_shift_s16(b[22]); + b[23] = add_round_shift_s16(b[23]); + b[24] = add_round_shift_s16(b[24]); + b[25] = add_round_shift_s16(b[25]); + b[26] = add_round_shift_s16(b[26]); + b[27] = add_round_shift_s16(b[27]); + + b[28] = add_round_shift_s16(a[28]); + b[29] = add_round_shift_s16(a[29]); + b[30] = add_round_shift_s16(a[30]); + b[31] = add_round_shift_s16(a[31]); + + // Stage 3. + a[0] = vaddq_s16(b[0], b[7]); + a[1] = vaddq_s16(b[1], b[6]); + a[2] = vaddq_s16(b[2], b[5]); + a[3] = vaddq_s16(b[3], b[4]); + + a[4] = vsubq_s16(b[3], b[4]); + a[5] = vsubq_s16(b[2], b[5]); + a[6] = vsubq_s16(b[1], b[6]); + a[7] = vsubq_s16(b[0], b[7]); + + a[8] = b[8]; + a[9] = b[9]; + + butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]); + butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]); + + a[14] = b[14]; + a[15] = b[15]; + + a[16] = vaddq_s16(b[16], b[23]); + a[17] = vaddq_s16(b[17], b[22]); + a[18] = vaddq_s16(b[18], b[21]); + a[19] = vaddq_s16(b[19], b[20]); + + a[20] = vsubq_s16(b[19], b[20]); + a[21] = vsubq_s16(b[18], b[21]); + a[22] = vsubq_s16(b[17], b[22]); + a[23] = vsubq_s16(b[16], b[23]); + + a[24] = vsubq_s16(b[31], b[24]); + a[25] = vsubq_s16(b[30], b[25]); + a[26] = vsubq_s16(b[29], b[26]); + a[27] = vsubq_s16(b[28], b[27]); + + a[28] = vaddq_s16(b[28], b[27]); + a[29] = vaddq_s16(b[29], b[26]); + a[30] = vaddq_s16(b[30], b[25]); + a[31] = vaddq_s16(b[31], b[24]); + + // Stage 4. 
+ b[0] = vaddq_s16(a[0], a[3]); + b[1] = vaddq_s16(a[1], a[2]); + b[2] = vsubq_s16(a[1], a[2]); + b[3] = vsubq_s16(a[0], a[3]); + + b[4] = a[4]; + + butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]); + + b[7] = a[7]; + + b[8] = vaddq_s16(a[8], a[11]); + b[9] = vaddq_s16(a[9], a[10]); + b[10] = vsubq_s16(a[9], a[10]); + b[11] = vsubq_s16(a[8], a[11]); + b[12] = vsubq_s16(a[15], a[12]); + b[13] = vsubq_s16(a[14], a[13]); + b[14] = vaddq_s16(a[14], a[13]); + b[15] = vaddq_s16(a[15], a[12]); + + b[16] = a[16]; + b[17] = a[17]; + + butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]); + butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]); + butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]); + butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]); + + b[22] = a[22]; + b[23] = a[23]; + b[24] = a[24]; + b[25] = a[25]; + + b[30] = a[30]; + b[31] = a[31]; + + // Stage 5. + butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]); + butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]); + + a[4] = vaddq_s16(b[4], b[5]); + a[5] = vsubq_s16(b[4], b[5]); + a[6] = vsubq_s16(b[7], b[6]); + a[7] = vaddq_s16(b[7], b[6]); + + a[8] = b[8]; + + butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]); + butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]); + + a[11] = b[11]; + a[12] = b[12]; + + a[15] = b[15]; + + a[16] = vaddq_s16(b[19], b[16]); + a[17] = vaddq_s16(b[18], b[17]); + a[18] = vsubq_s16(b[17], b[18]); + a[19] = vsubq_s16(b[16], b[19]); + a[20] = vsubq_s16(b[23], b[20]); + a[21] = vsubq_s16(b[22], b[21]); + a[22] = vaddq_s16(b[21], b[22]); + a[23] = vaddq_s16(b[20], b[23]); + a[24] = vaddq_s16(b[27], b[24]); + a[25] = vaddq_s16(b[26], b[25]); + a[26] = vsubq_s16(b[25], b[26]); + a[27] = vsubq_s16(b[24], b[27]); + a[28] = vsubq_s16(b[31], b[28]); + a[29] = vsubq_s16(b[30], b[29]); + a[30] = vaddq_s16(b[29], b[30]); + a[31] = vaddq_s16(b[28], b[31]); + + // Stage 6. + b[0] = a[0]; + b[1] = a[1]; + b[2] = a[2]; + b[3] = a[3]; + + butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]); + butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]); + + b[8] = vaddq_s16(a[8], a[9]); + b[9] = vsubq_s16(a[8], a[9]); + b[10] = vsubq_s16(a[11], a[10]); + b[11] = vaddq_s16(a[11], a[10]); + b[12] = vaddq_s16(a[12], a[13]); + b[13] = vsubq_s16(a[12], a[13]); + b[14] = vsubq_s16(a[15], a[14]); + b[15] = vaddq_s16(a[15], a[14]); + + b[16] = a[16]; + b[19] = a[19]; + b[20] = a[20]; + b[23] = a[23]; + b[24] = a[24]; + b[27] = a[27]; + b[28] = a[28]; + b[31] = a[31]; + + butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]); + butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]); + + butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]); + butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]); + + // Stage 7. 
+ a[0] = b[0]; + a[1] = b[1]; + a[2] = b[2]; + a[3] = b[3]; + a[4] = b[4]; + a[5] = b[5]; + a[6] = b[6]; + a[7] = b[7]; + + butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]); + butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]); + butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]); + butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]); + + a[16] = vaddq_s16(b[16], b[17]); + a[17] = vsubq_s16(b[16], b[17]); + a[18] = vsubq_s16(b[19], b[18]); + a[19] = vaddq_s16(b[19], b[18]); + a[20] = vaddq_s16(b[20], b[21]); + a[21] = vsubq_s16(b[20], b[21]); + a[22] = vsubq_s16(b[23], b[22]); + a[23] = vaddq_s16(b[23], b[22]); + a[24] = vaddq_s16(b[24], b[25]); + a[25] = vsubq_s16(b[24], b[25]); + a[26] = vsubq_s16(b[27], b[26]); + a[27] = vaddq_s16(b[27], b[26]); + a[28] = vaddq_s16(b[28], b[29]); + a[29] = vsubq_s16(b[28], b[29]); + a[30] = vsubq_s16(b[31], b[30]); + a[31] = vaddq_s16(b[31], b[30]); + + // Final stage. + out[0] = a[0]; + out[16] = a[1]; + out[8] = a[2]; + out[24] = a[3]; + out[4] = a[4]; + out[20] = a[5]; + out[12] = a[6]; + out[28] = a[7]; + out[2] = a[8]; + out[18] = a[9]; + out[10] = a[10]; + out[26] = a[11]; + out[6] = a[12]; + out[22] = a[13]; + out[14] = a[14]; + out[30] = a[15]; + + butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]); + butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17], + &out[15]); + butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]); + butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]); + butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]); + butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21], + &out[11]); + butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13], + &out[19]); + butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]); +} + +#undef PASS_THROUGH +#undef ADD_S16_S32 +#undef SUB_S16_S32 +#undef ADDW_S16_S32 +#undef SUBW_S16_S32 +#undef ADD_S32 +#undef SUB_S32 +#undef BUTTERFLY_ONE_S16_S32 +#undef BUTTERFLY_ONE_S32 +#undef BUTTERFLY_TWO_S32 + +// Transpose 8x8 to a new location. Don't use transpose_neon.h because those +// are all in-place. +// TODO(johannkoenig): share with other fdcts. +static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) { + // Swap 16 bit elements. + const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements. 
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); + const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), + vreinterpretq_s32_s16(c3.val[0])); + const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), + vreinterpretq_s32_s16(c3.val[1])); + + // Swap 64 bit elements + const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); + const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); + const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); + const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); + + b[0] = e0.val[0]; + b[1] = e1.val[0]; + b[2] = e2.val[0]; + b[3] = e3.val[0]; + b[4] = e0.val[1]; + b[5] = e1.val[1]; + b[6] = e2.val[1]; + b[7] = e3.val[1]; +} + +void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + dct_body_first_pass(temp0, temp1); + + load(input + 8, stride, temp0); + dct_body_first_pass(temp0, temp2); + + load(input + 16, stride, temp0); + dct_body_first_pass(temp0, temp3); + + load(input + 24, stride, temp0); + dct_body_first_pass(temp0, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. 
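+  // (Rows 1-3 below repeat the same pattern; schematically, for each
+  // row in {1, 2, 3}:
+  //   transpose_8x8(&tempN[8 * row], &temp0[8 * (N - 1)]);  // N = 1..4
+  //   dct_body_second_pass(temp0, temp5);
+  //   4x transpose_s16_8x8(...); store(output + row * 8 * 32, temp5);)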
+ transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + dct_body_second_pass(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} + +void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, + int stride) { + int16x8_t temp0[32]; + int16x8_t temp1[32]; + int16x8_t temp2[32]; + int16x8_t temp3[32]; + int16x8_t temp4[32]; + int16x8_t temp5[32]; + + // Process in 8x32 columns. + load(input, stride, temp0); + dct_body_first_pass(temp0, temp1); + + load(input + 8, stride, temp0); + dct_body_first_pass(temp0, temp2); + + load(input + 16, stride, temp0); + dct_body_first_pass(temp0, temp3); + + load(input + 24, stride, temp0); + dct_body_first_pass(temp0, temp4); + + // Generate the top row by munging the first set of 8 from each one together. + transpose_8x8(&temp1[0], &temp0[0]); + transpose_8x8(&temp2[0], &temp0[8]); + transpose_8x8(&temp3[0], &temp0[16]); + transpose_8x8(&temp4[0], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output, temp5); + + // Second row of 8x32. 
+ transpose_8x8(&temp1[8], &temp0[0]); + transpose_8x8(&temp2[8], &temp0[8]); + transpose_8x8(&temp3[8], &temp0[16]); + transpose_8x8(&temp4[8], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 8 * 32, temp5); + + // Third row of 8x32 + transpose_8x8(&temp1[16], &temp0[0]); + transpose_8x8(&temp2[16], &temp0[8]); + transpose_8x8(&temp3[16], &temp0[16]); + transpose_8x8(&temp4[16], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 16 * 32, temp5); + + // Final row of 8x32. + transpose_8x8(&temp1[24], &temp0[0]); + transpose_8x8(&temp2[24], &temp0[8]); + transpose_8x8(&temp3[24], &temp0[16]); + transpose_8x8(&temp4[24], &temp0[24]); + + dct_body_second_pass_rd(temp0, temp5); + + transpose_s16_8x8(&temp5[0], &temp5[1], &temp5[2], &temp5[3], &temp5[4], + &temp5[5], &temp5[6], &temp5[7]); + transpose_s16_8x8(&temp5[8], &temp5[9], &temp5[10], &temp5[11], &temp5[12], + &temp5[13], &temp5[14], &temp5[15]); + transpose_s16_8x8(&temp5[16], &temp5[17], &temp5[18], &temp5[19], &temp5[20], + &temp5[21], &temp5[22], &temp5[23]); + transpose_s16_8x8(&temp5[24], &temp5[25], &temp5[26], &temp5[27], &temp5[28], + &temp5[29], &temp5[30], &temp5[31]); + store(output + 24 * 32, temp5); +} +#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) && + // __GNUC__ == 4 && __GNUC_MINOR__ <= 9 diff --git a/libvpx/vpx_dsp/arm/fdct_neon.c b/libvpx/vpx_dsp/arm/fdct_neon.c index fe78f3f51..04646ed2e 100644 --- a/libvpx/vpx_dsp/arm/fdct_neon.c +++ b/libvpx/vpx_dsp/arm/fdct_neon.c @@ -50,8 +50,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. 
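   // (Products of the 16-bit inputs and the roughly 14-bit cospi constants
   // need 32 bits until fdct_round_shift narrows them back.)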
const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); - const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int16_t)cospi_16_64); - const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int16_t)cospi_16_64); + const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); + const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); // fdct_round_shift int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); @@ -59,13 +59,11 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, // s_3 * cospi_8_64 + s_2 * cospi_24_64 // s_3 * cospi_24_64 - s_2 * cospi_8_64 - const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int16_t)cospi_8_64); - const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int16_t)cospi_24_64); + const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); + const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); - const int32x4_t temp3 = - vmlal_n_s16(s_3_cospi_8_64, s_2, (int16_t)cospi_24_64); - const int32x4_t temp4 = - vmlsl_n_s16(s_3_cospi_24_64, s_2, (int16_t)cospi_8_64); + const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); + const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); // fdct_round_shift int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); diff --git a/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libvpx/vpx_dsp/arm/fdct_partial_neon.c new file mode 100644 index 000000000..e73de41d7 --- /dev/null +++ b/libvpx/vpx_dsp/arm/fdct_partial_neon.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE tran_low_t get_lane(const int32x2_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + return vget_lane_s32(a, 0); +#else + return vget_lane_s16(vreinterpret_s16_s32(a), 0); +#endif // CONFIG_VP9_HIGHBITDETPH +} + +void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { + int16x4_t a0, a1, a2, a3; + int16x8_t b0, b1; + int16x8_t c; + int32x2_t d; + + a0 = vld1_s16(input); + input += stride; + a1 = vld1_s16(input); + input += stride; + a2 = vld1_s16(input); + input += stride; + a3 = vld1_s16(input); + + b0 = vcombine_s16(a0, a1); + b1 = vcombine_s16(a2, a3); + + c = vaddq_s16(b0, b1); + + d = horizontal_add_int16x8(c); + + output[0] = get_lane(vshl_n_s32(d, 1)); + output[1] = 0; +} + +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + + output[0] = get_lane(horizontal_add_int16x8(sum)); + output[1] = 0; +} + +void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t left = vld1q_s16(input); + int16x8_t right = vld1q_s16(input + 8); + int32x2_t sum; + input += stride; + + for (r = 1; r < 16; ++r) { + const int16x8_t a = vld1q_s16(input); + const int16x8_t b = vld1q_s16(input + 8); + input += stride; + left = vaddq_s16(left, a); + right = vaddq_s16(right, b); + } + + sum = vadd_s32(horizontal_add_int16x8(left), horizontal_add_int16x8(right)); + + output[0] = get_lane(vshr_n_s32(sum, 1)); + output[1] = 0; +} + +void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, + int stride) { + int r; + int16x8_t a0 = vld1q_s16(input); + int16x8_t a1 = vld1q_s16(input + 8); + int16x8_t a2 = vld1q_s16(input + 16); + int16x8_t a3 = vld1q_s16(input + 24); + int32x2_t sum; + input += stride; + + for (r = 1; r < 32; ++r) { + const int16x8_t b0 = vld1q_s16(input); + const int16x8_t b1 = vld1q_s16(input + 8); + const int16x8_t b2 = vld1q_s16(input + 16); + const int16x8_t b3 = vld1q_s16(input + 24); + input += stride; + a0 = vaddq_s16(a0, b0); + a1 = vaddq_s16(a1, b1); + a2 = vaddq_s16(a2, b2); + a3 = vaddq_s16(a3, b3); + } + + sum = vadd_s32(horizontal_add_int16x8(a0), horizontal_add_int16x8(a1)); + sum = vadd_s32(sum, horizontal_add_int16x8(a2)); + sum = vadd_s32(sum, horizontal_add_int16x8(a3)); + output[0] = get_lane(vshr_n_s32(sum, 3)); + output[1] = 0; +} diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c index c449b4660..8049277b1 100644 --- a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c @@ -48,18 +48,18 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); - int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); - int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); - int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); - v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); - v_t2_hi = vmlal_n_s16(v_t2_hi, 
vget_high_s16(v_x3), (int16_t)cospi_8_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); @@ -77,10 +77,10 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, // Stage 2 v_x0 = vsubq_s16(v_s6, v_s5); v_x1 = vaddq_s16(v_s6, v_s5); - v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); @@ -95,22 +95,22 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, v_x3 = vaddq_s16(v_s7, cd); } // Stage 4 - v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); - v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); - v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); - v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); - v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); - v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); - v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); - v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); - v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); - v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); - v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); - v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); - v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); - v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); - v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); - v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, 
vget_high_s16(v_x0), cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); @@ -207,24 +207,3 @@ void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, store_s16q_to_tran_low(final_output + 7 * 8, input_7); } } - -void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { - int r; - int16x8_t sum = vld1q_s16(&input[0]); - for (r = 1; r < 8; ++r) { - const int16x8_t input_00 = vld1q_s16(&input[r * stride]); - sum = vaddq_s16(sum, input_00); - } - { - const int32x4_t a = vpaddlq_s16(sum); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); -#if CONFIG_VP9_HIGHBITDEPTH - output[0] = vget_lane_s32(c, 0); -#else - output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); -#endif - output[1] = 0; - } -} diff --git a/libvpx/vpx_dsp/arm/hadamard_neon.c b/libvpx/vpx_dsp/arm/hadamard_neon.c index 79bedd848..523a63c6f 100644 --- a/libvpx/vpx_dsp/arm/hadamard_neon.c +++ b/libvpx/vpx_dsp/arm/hadamard_neon.c @@ -47,7 +47,7 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, *a7 = vaddq_s16(c1, c5); } -void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, +void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); @@ -76,7 +76,7 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, store_s16q_to_tran_low(coeff + 56, a7); } -void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, +void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int i; diff --git a/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c index 98e42cd25..5358839b5 100644 --- a/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -1410,10 +1410,10 @@ static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); int i; diff --git a/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c 
b/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c index 63eb49678..c1354c0c1 100644 --- a/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -61,10 +61,10 @@ static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); int i; diff --git a/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 20b09f683..1418a75a1 100644 --- a/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -54,10 +54,10 @@ static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); diff --git a/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c index 6687e7649..dd90134a6 100644 --- a/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -38,10 +38,10 @@ static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const tran_low_t out0 = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - const tran_low_t out1 = - HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const tran_low_t out0 = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + const tran_low_t out1 = HIGHBD_WRAPLOW( + dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); const int16x8_t dc = vdupq_n_s16(a1); diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index 74345e1fa..c46c01631 100644 --- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -17,8 +17,9 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_ports/mem.h" -static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0, - int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) { +static INLINE void load_4x4(const int16_t *s, const ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2, int16x4_t *const s3) { *s0 = vld1_s16(s); s += p; *s1 = vld1_s16(s); @@ -28,8 +29,9 @@ static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0, *s3 = vld1_s16(s); } -static INLINE void load_8x4(const 
uint16_t *s, ptrdiff_t p, uint16x8_t *s0, - uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) { +static INLINE void load_8x4(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *const s0, uint16x8_t *const s1, + uint16x8_t *const s2, uint16x8_t *const s3) { *s0 = vld1q_u16(s); s += p; *s1 = vld1q_u16(s); @@ -39,10 +41,11 @@ static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0, *s3 = vld1q_u16(s); } -static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0, - int16x8_t *s1, int16x8_t *s2, int16x8_t *s3, - int16x8_t *s4, int16x8_t *s5, int16x8_t *s6, - int16x8_t *s7) { +static INLINE void load_8x8(const int16_t *s, const ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2, int16x8_t *const s3, + int16x8_t *const s4, int16x8_t *const s5, + int16x8_t *const s6, int16x8_t *const s7) { *s0 = vld1q_s16(s); s += p; *s1 = vld1q_s16(s); @@ -60,11 +63,11 @@ static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0, *s7 = vld1q_s16(s); } -static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0, - const uint16x8_t s1, const uint16x8_t s2, - const uint16x8_t s3, const uint16x8_t s4, - const uint16x8_t s5, const uint16x8_t s6, - const uint16x8_t s7) { +static INLINE void store_8x8(uint16_t *s, const ptrdiff_t p, + const uint16x8_t s0, const uint16x8_t s1, + const uint16x8_t s2, const uint16x8_t s3, + const uint16x8_t s4, const uint16x8_t s5, + const uint16x8_t s6, const uint16x8_t s7) { vst1q_u16(s, s0); s += p; vst1q_u16(s, s1); @@ -82,16 +85,15 @@ static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0, vst1q_u16(s, s7); } -static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, - const int16x4_t s2, const int16x4_t s3, - const int16x4_t s4, const int16x4_t s5, - const int16x4_t s6, const int16x4_t s7, - const int16x8_t filters) { +static INLINE int32x4_t highbd_convolve8_4( + const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, + const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, const int16x8_t filters) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum = vdupq_n_s32(0); + int32x4_t sum; - sum = vmlal_lane_s16(sum, s0, filters_lo, 0); + sum = vmull_lane_s16(s0, filters_lo, 0); sum = vmlal_lane_s16(sum, s1, filters_lo, 1); sum = vmlal_lane_s16(sum, s2, filters_lo, 2); sum = vmlal_lane_s16(sum, s3, filters_lo, 3); @@ -102,19 +104,17 @@ static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, return sum; } -static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, - const int16x8_t s2, const int16x8_t s3, - const int16x8_t s4, const int16x8_t s5, - const int16x8_t s6, const int16x8_t s7, - const int16x8_t filters, - const uint16x8_t max) { +static INLINE uint16x8_t +highbd_convolve8_8(const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, + const int16x8_t s3, const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, const uint16x8_t max) { const int16x4_t filters_lo = vget_low_s16(filters); const int16x4_t filters_hi = vget_high_s16(filters); - int32x4_t sum0 = vdupq_n_s32(0); - int32x4_t sum1 = vdupq_n_s32(0); + int32x4_t sum0, sum1; uint16x8_t d; - sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filters_lo, 0); + sum0 = vmull_lane_s16(vget_low_s16(s0), filters_lo, 0); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); sum0 = 
vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); @@ -122,7 +122,7 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); - sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filters_lo, 0); + sum1 = vmull_lane_s16(vget_high_s16(s0), filters_lo, 0); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2); sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); @@ -137,15 +137,14 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); } else { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -182,10 +181,10 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); transpose_s16_4x4d(&s7, &s8, &s9, &s10); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); @@ -241,10 +240,11 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); transpose_u16_8x4(&d0, &d1, &d2, &d3); vst1_u16(dst, vget_low_u16(d0)); @@ -302,14 +302,22 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, &s12, &s13, &s14); transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, 
filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); @@ -337,15 +345,15 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } else { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -382,10 +390,10 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); transpose_s16_4x4d(&s7, &s8, &s9, &s10); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); @@ -448,10 +456,11 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = 
highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); transpose_u16_8x4(&t0, &t1, &t2, &t3); d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), @@ -522,14 +531,22 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, &s12, &s13, &s14); transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, + max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, + max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, + max); + d3 = highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, + max); + d4 = highbd_convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, + max); + d5 = highbd_convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, + max); + d6 = highbd_convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, + max); + d7 = highbd_convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, + filters, max); transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); @@ -566,15 +583,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); } else { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); assert(!((intptr_t)dst & 3)); @@ -620,10 +636,10 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); @@ -698,10 +714,11 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, 
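This hunk migrates the vertical path to the same prototype as the horizontal ones above: the paired int16_t *filter_x / *filter_y pointers (one of which was always unused per direction) become a single InterpKernel table plus x0_q4/y0_q4 phase offsets. A minimal sketch of what that indexing means, assuming the upstream typedef in vpx_dsp/vpx_filter.h (SUBPEL_TAPS == 8, 16 subpel phases):

#include <stdint.h>

#define SUBPEL_TAPS 8
typedef int16_t InterpKernel[SUBPEL_TAPS];

/* Each table entry is one contiguous 8-tap kernel, which is why the NEON
 * code can fetch the whole kernel with a single vld1q_s16(filter[y0_q4]). */
static const int16_t *select_kernel(const InterpKernel *filter, int phase_q4) {
  return filter[phase_q4]; /* phase_q4 in [0, 15] picks one subpel phase */
}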
__builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + d0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); vst1q_u16(d, d0); d += dst_stride; @@ -732,15 +749,15 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } else { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); assert(!((intptr_t)dst & 3)); @@ -786,10 +803,10 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); - d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); - d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); - d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); - d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d0 = highbd_convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = highbd_convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = highbd_convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = highbd_convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); @@ -872,10 +889,11 @@ void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + t0 = highbd_convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = highbd_convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = highbd_convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = + highbd_convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); d0 = vld1q_u16(d + 0 * dst_stride); d1 = vld1q_u16(d + 1 * dst_stride); diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c index 4ff3dea08..765a054f8 100644 --- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c +++ 
b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -15,13 +15,14 @@ void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; if (w < 8) { // avg4 diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 61712d48e..9d2752e09 100644 --- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -15,13 +15,14 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; if (w < 8) { // copy4 diff --git a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c index f769620a4..414ade353 100644 --- a/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ b/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -15,12 +15,11 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 - DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); + uint16_t temp[64 * 136]; const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; @@ -29,22 +28,21 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. 
*/ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter_x, x_step_q4, filter_y, y_step_q4, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height, bd); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 - DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); + uint16_t temp[64 * 136]; const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; @@ -52,8 +50,9 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, * to average the values after both passes. */ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter_x, x_step_q4, filter_y, y_step_q4, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } diff --git a/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c index 968bc5cc3..bf5192a68 100644 --- a/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -32,7 +32,8 @@ static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); diff --git a/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c index 604d82abd..8920b9336 100644 --- a/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -39,7 +39,8 @@ static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { int i; - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); diff --git a/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index 21d21b033..a14b89543 100644 --- a/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -32,7 +32,8 @@ static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = 
WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); diff --git a/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c index 7bcce913b..ce9b45958 100644 --- a/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -36,7 +36,8 @@ static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out0 = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); diff --git a/libvpx/vpx_dsp/arm/idct_neon.h b/libvpx/vpx_dsp/arm/idct_neon.h index 0fc1de8e4..6ed02af5a 100644 --- a/libvpx/vpx_dsp/arm/idct_neon.h +++ b/libvpx/vpx_dsp/arm/idct_neon.h @@ -18,7 +18,7 @@ #include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" -DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = { +static const int16_t kCospi[16] = { 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, @@ -29,7 +29,7 @@ DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = { 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ }; -DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = { +static const int32_t kCospi32[16] = { 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, diff --git a/libvpx/vpx_dsp/arm/mem_neon.h b/libvpx/vpx_dsp/arm/mem_neon.h index 37b89b276..4efad5333 100644 --- a/libvpx/vpx_dsp/arm/mem_neon.h +++ b/libvpx/vpx_dsp/arm/mem_neon.h @@ -79,6 +79,32 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { memcpy(buf, &a, 4); } +// Load 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { + uint32_t a; + uint32x2_t a_u32 = vdup_n_u32(0); + if (stride == 4) return vld1_u8(buf); + memcpy(&a, buf, 4); + buf += stride; + a_u32 = vld1_lane_u32(&a, a_u32, 0); + memcpy(&a, buf, 4); + a_u32 = vld1_lane_u32(&a, a_u32, 1); + return vreinterpret_u8_u32(a_u32); +} + +// Store 2 sets of 4 bytes when alignment is not guaranteed. +static INLINE void store_unaligned_u8(uint8_t *buf, int stride, + const uint8x8_t a) { + const uint32x2_t a_u32 = vreinterpret_u32_u8(a); + if (stride == 4) { + vst1_u8(buf, a); + return; + } + uint32_to_mem(buf, vget_lane_u32(a_u32, 0)); + buf += stride; + uint32_to_mem(buf, vget_lane_u32(a_u32, 1)); +} + // Load 4 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { uint32_t a; diff --git a/libvpx/vpx_dsp/arm/quantize_neon.c b/libvpx/vpx_dsp/arm/quantize_neon.c new file mode 100644 index 000000000..a0a1e6dd5 --- /dev/null +++ b/libvpx/vpx_dsp/arm/quantize_neon.c @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" + +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + // Process first 8 values which include a dc component. + { + // Only the first element of each vector is DC. + const int16x8_t zbin = vld1q_s16(zbin_ptr); + const int16x8_t round = vld1q_s16(round_ptr); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + const int16x8_t dequant = vld1q_s16(dequant_ptr); + // Add one because the eob does not index from 0. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + qcoeff = vmulq_s16(qcoeff, dequant); + + store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + dqcoeff_ptr += 8; + } + + n_coeffs -= 8; + + { + const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]); + const int16x8_t round = vdupq_n_s16(round_ptr[1]); + const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); + const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); + const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + + do { + // Add one because the eob is not its index. 
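The "+ 1" comments here are the crux of the eob logic. A minimal scalar model (mine, not from the patch) of what the vtst/vand/vmax lanes compute before the final vpmax cascade folds them to a single value:

#include <stdint.h>

/* Each lane holds iscan[i] + 1 wherever qcoeff[i] != 0 (vtstq_s16 yields
 * all-ones there and vandq_u16 keeps the scan index); the running
 * vmaxq_u16 then tracks the largest such index, i.e. the end-of-block. */
static uint16_t eob_model(const int16_t *qcoeff, const int16_t *iscan, int n) {
  uint16_t eob = 0;
  int i;
  for (i = 0; i < n; ++i) {
    const uint16_t v = qcoeff[i] ? (uint16_t)(iscan[i] + 1) : 0;
    if (v > eob) eob = v;
  }
  return eob;
}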
+ const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16 + qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + qcoeff = vmulq_s16(qcoeff, dequant); + + store_s16q_to_tran_low(dqcoeff_ptr, qcoeff); + dqcoeff_ptr += 8; + + n_coeffs -= 8; + } while (n_coeffs > 0); + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +} + +static INLINE int32x4_t extract_sign_bit(int32x4_t a) { + return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31)); +} + +// Main difference is that zbin values are halved before comparison and dqcoeff +// values are divided by 2. zbin is rounded but dqcoeff is not. +void vpx_quantize_b_32x32_neon( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const int16x8_t one = vdupq_n_s16(1); + const int16x8_t neg_one = vdupq_n_s16(-1); + uint16x8_t eob_max; + int i; + (void)scan_ptr; + (void)n_coeffs; // Because we will always calculate 32*32. + (void)skip_block; + assert(!skip_block); + + // Process first 8 values which include a dc component. + { + // Only the first element of each vector is DC. + const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); + const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); + const int16x8_t quant = vld1q_s16(quant_ptr); + const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + const int16x8_t dequant = vld1q_s16(dequant_ptr); + // Add one because the eob does not index from 0. 
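Both quantizers also lean on the fixed-point identity flagged in the shift comments: vqdmulhq_s16 computes a saturating (a * b * 2) >> 16 per lane, so one extra right shift recovers the C reference's multiply-high. A scalar sketch, assuming no saturation (the inputs here are non-negative, so the only saturating pair, -32768 * -32768, cannot occur):

#include <stdint.h>

/* Model of vqdmulhq_s16 followed by vshrq_n_s16(..., 1):
 * ((a * b * 2) >> 16) >> 1 == (a * b) >> 16. */
static int16_t mulhi(int16_t a, int16_t b) {
  const int32_t doubled = (int32_t)a * b * 2; /* what vqdmulh returns */
  return (int16_t)((doubled >> 16) >> 1);
}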
+ const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + int16x8_t dqcoeff; + int32x4_t dqcoeff_0, dqcoeff_1; + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + // Add 1 if negative to round towards zero because the C uses division. + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + dqcoeff_ptr += 8; + } + + { + const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1); + const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1); + const int16x8_t quant = vdupq_n_s16(quant_ptr[1]); + const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]); + const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]); + + for (i = 1; i < 32 * 32 / 8; ++i) { + // Add one because the eob is not its index. + const uint16x8_t iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + + const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); + const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); + const int16x8_t coeff_abs = vabsq_s16(coeff); + + const int16x8_t zbin_mask = + vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin)); + + const int16x8_t rounded = vqaddq_s16(coeff_abs, round); + + // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16 + int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1); + int16x8_t dqcoeff; + int32x4_t dqcoeff_0, dqcoeff_1; + + qcoeff = vaddq_s16(qcoeff, rounded); + + // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15 + qcoeff = vqdmulhq_s16(qcoeff, quant_shift); + + // Restore the sign bit. + qcoeff = veorq_s16(qcoeff, coeff_sign); + qcoeff = vsubq_s16(qcoeff, coeff_sign); + + qcoeff = vandq_s16(qcoeff, zbin_mask); + + // Set non-zero elements to -1 and use that to extract values for eob. 
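The extract_sign_bit additions a few lines up exist because the C reference computes dqcoeff with integer division, which truncates toward zero, while an arithmetic right shift floors. Adding the sign bit first makes the two agree for negative products. A scalar sketch:

#include <stdint.h>

/* Matches extract_sign_bit + vshrn_n_s32(..., 1): add 1 to negative values
 * so the arithmetic shift rounds toward zero like C's x / 2.
 * e.g. x = -5: (-5 + 1) >> 1 == -2 == -5 / 2; x = 5: 5 >> 1 == 2. */
static int16_t halve_toward_zero(int32_t x) {
  const int32_t sign = (int32_t)((uint32_t)x >> 31); /* 1 iff x < 0 */
  return (int16_t)((x + sign) >> 1);
}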
+ eob_max = + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + + coeff_ptr += 8; + iscan_ptr += 8; + + store_s16q_to_tran_low(qcoeff_ptr, qcoeff); + qcoeff_ptr += 8; + + dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant)); + dqcoeff_1 = vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant)); + + dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0)); + dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1)); + + dqcoeff = + vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); + + store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); + dqcoeff_ptr += 8; + } + } + + { + const uint16x4_t eob_max_0 = + vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max)); + const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0); + const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); + vst1_lane_u16(eob_ptr, eob_max_2, 0); + } +} diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c index dc2039800..b04de3aff 100644 --- a/libvpx/vpx_dsp/arm/sad4d_neon.c +++ b/libvpx/vpx_dsp/arm/sad4d_neon.c @@ -13,212 +13,230 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} - -// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16, -// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo -// and vec_sum_ref_hi. -static void sad_neon_64(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, - const uint8x16_t vec_src_32, - const uint8x16_t vec_src_48, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); -} - -// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16, -// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi. 
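The rewrite below drops these hand-rolled reductions and per-width loops in favor of shared helpers from vpx_dsp/arm/sum_neon.h. Judging from the call sites (vget_lane_u32(horizontal_add_uint16x8(abs), 0)), that helper presumably mirrors the horizontal_long_add_16x8 being deleted here, along these lines:

#include <arm_neon.h>

/* Sketch of horizontal_add_uint16x8 as used below: pairwise-widen twice,
 * then fold the two 64-bit halves so the total lands in lane 0, matching
 * the vget_lane_u32(..., 0) at every call site. */
static inline uint32x2_t horizontal_add_uint16x8_sketch(const uint16x8_t a) {
  const uint32x4_t b = vpaddlq_u16(a);
  const uint64x2_t c = vpaddlq_u32(b);
  return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
                  vreinterpret_u32_u64(vget_high_u64(c)));
}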
-static void sad_neon_32(const uint8x16_t vec_src_00, - const uint8x16_t vec_src_16, const uint8_t *ref, - uint16x8_t *vec_sum_ref_lo, - uint16x8_t *vec_sum_ref_hi) { - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); +void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); + for (i = 0; i < 4; ++i) { + const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + } } -void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, +void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + int i; + const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); + const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); + for (i = 0; i < 4; ++i) { + const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); + const uint8x16_t ref_1 = + load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); + abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); + abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); + abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); + res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); + } +} + +static INLINE void sad8x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x8_t b_u8 = vld1_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], a_u8, b_u8); + } + } + + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 4); +} + +void vpx_sad8x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 8); +} + +void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad8x_4d(src, src_stride, ref, ref_stride, res, 16); +} + +static INLINE void sad16x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < 
height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + } + } + + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 8); +} + +void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0, - &vec_sum_ref0_lo, &vec_sum_ref0_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1, - &vec_sum_ref1_lo, &vec_sum_ref1_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2, - &vec_sum_ref2_lo, &vec_sum_ref2_hi); - sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3, - &vec_sum_ref3_lo, &vec_sum_ref3_hi); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + sad16x_4d(src, src_stride, ref, ref_stride, res, 16); +} + +void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad16x_4d(src, src_stride, ref, ref_stride, res, 32); +} + +static INLINE void sad32x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { + int i, j; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + a += a_stride; + for (j = 0; j < 4; ++j) { + const uint8x16_t b_0 = vld1q_u8(b_loop[j]); + const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); + b_loop[j] += b_stride; + sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); + sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); + sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); + } } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + for (j = 0; j < 4; ++j) { + result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + } +} + +void vpx_sad32x16x4d_neon(const uint8_t *src, int 
src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 16); } void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 32); +} + +void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad32x_4d(src, src_stride, ref, ref_stride, res, 64); +} + +static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, + const uint8x16_t b_0, const uint8x16_t b_1, + uint16x8_t *sum) { + *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); + *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); + *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); + *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); +} + +static INLINE void sad64x_4d(const uint8_t *a, int a_stride, + const uint8_t *const b[4], int b_stride, + uint32_t *result, const int height) { int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - - sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, - &vec_sum_ref0_hi); - sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, - &vec_sum_ref1_hi); - sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, - &vec_sum_ref2_hi); - sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, - &vec_sum_ref3_hi); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; + uint16x8_t sum_0 = vdupq_n_u16(0); + uint16x8_t sum_1 = vdupq_n_u16(0); + uint16x8_t sum_2 = vdupq_n_u16(0); + uint16x8_t sum_3 = vdupq_n_u16(0); + uint16x8_t sum_4 = vdupq_n_u16(0); + uint16x8_t sum_5 = vdupq_n_u16(0); + uint16x8_t sum_6 = vdupq_n_u16(0); + uint16x8_t sum_7 = vdupq_n_u16(0); + const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + a += a_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); + sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), + &sum_1); + b_loop[0] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); + sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), + &sum_3); + b_loop[1] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); + sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), + &sum_5); + b_loop[2] += b_stride; + sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); + sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48), + &sum_7); + b_loop[3] += b_stride; } - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, 
vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); + result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); + result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); + result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); + result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); } -void vpx_sad16x16x4d_neon(const uint8_t *src, int src_stride, +void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0); - const uint8_t *ref0, *ref1, *ref2, *ref3; - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref0 = vld1q_u8(ref0); - const uint8x16_t vec_ref1 = vld1q_u8(ref1); - const uint8x16_t vec_ref2 = vld1q_u8(ref2); - const uint8x16_t vec_ref3 = vld1q_u8(ref3); - - vec_sum_ref0_lo = - vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0)); - vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref0)); - vec_sum_ref1_lo = - vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1)); - vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref1)); - vec_sum_ref2_lo = - vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2)); - vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref2)); - vec_sum_ref3_lo = - vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3)); - vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), - vget_high_u8(vec_ref3)); - - src += src_stride; - ref0 += ref_stride; - ref1 += ref_stride; - ref2 += ref_stride; - ref3 += ref_stride; - } + sad64x_4d(src, src_stride, ref, ref_stride, res, 32); +} - res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi); - res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi); - res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi); - res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi); +void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { + sad64x_4d(src, src_stride, ref, ref_stride, res, 64); } diff --git a/libvpx/vpx_dsp/arm/sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c index ff3228768..9518a166b 100644 --- a/libvpx/vpx_dsp/arm/sad_neon.c +++ b/libvpx/vpx_dsp/arm/sad_neon.c @@ -13,211 +13,332 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -unsigned int vpx_sad8x16_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 
= load_unaligned_u8q(ref_ptr, ref_stride); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); + uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride) { int i; + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); + } - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} - for (i = 0; i < 15; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); +uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + const uint8_t *second_pred) { + int i; + uint16x8_t abs = vdupq_n_u16(0); + for (i = 0; i < 8; i += 4) { + const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); + const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); + const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + second_pred += 16; + abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); } - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); +} + +static INLINE uint16x8_t sad8x(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); - return vget_lane_u32(d5, 0); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + const uint8x8_t b_u8 = vld1_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, a_u8, b_u8); + } + return abs; } -unsigned int vpx_sad4x4_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x8_t d0, d8; - uint16x8_t q12; - uint32x2_t d1; - uint64x1_t d3; +static INLINE uint16x8_t sad8x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; + uint16x8_t abs = vdupq_n_u16(0); - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(d0, d8); + for (i = 0; i < height; ++i) { + const uint8x8_t a_u8 = vld1_u8(a); + 
const uint8x8_t b_u8 = vld1_u8(b); + const uint8x8_t c_u8 = vld1_u8(c); + const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); + a += a_stride; + b += b_stride; + c += 8; + abs = vabal_u8(abs, a_u8, avg); + } + return abs; +} - for (i = 0; i < 3; i++) { - d0 = vld1_u8(src_ptr); - src_ptr += src_stride; - d8 = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, d0, d8); +#define sad8xN(n) \ + uint32_t vpx_sad8x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad8x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad8x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ } - d1 = vpaddl_u16(vget_low_u16(q12)); - d3 = vpaddl_u32(d1); +sad8xN(4); +sad8xN(8); +sad8xN(16); + +static INLINE uint16x8_t sad16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { + int i; + uint16x8_t abs = vdupq_n_u16(0); - return vget_lane_u32(vreinterpret_u32_u64(d3), 0); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); + } + return abs; } -unsigned int vpx_sad16x8_neon(unsigned char *src_ptr, int src_stride, - unsigned char *ref_ptr, int ref_stride) { - uint8x16_t q0, q4; - uint16x8_t q12, q13; - uint32x4_t q1; - uint64x2_t q3; - uint32x2_t d5; +static INLINE uint16x8_t sad16x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; + uint16x8_t abs = vdupq_n_u16(0); - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabdl_u8(vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabdl_u8(vget_high_u8(q0), vget_high_u8(q4)); - - for (i = 0; i < 7; i++) { - q0 = vld1q_u8(src_ptr); - src_ptr += src_stride; - q4 = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - q12 = vabal_u8(q12, vget_low_u8(q0), vget_low_u8(q4)); - q13 = vabal_u8(q13, vget_high_u8(q0), vget_high_u8(q4)); - } - - q12 = vaddq_u16(q12, q13); - q1 = vpaddlq_u16(q12); - q3 = vpaddlq_u32(q1); - d5 = vadd_u32(vreinterpret_u32_u64(vget_low_u64(q3)), - vreinterpret_u32_u64(vget_high_u64(q3))); - - return vget_lane_u32(d5, 0); + for (i = 0; i < height; ++i) { + const uint8x16_t a_u8 = vld1q_u8(a); + const uint8x16_t b_u8 = vld1q_u8(b); + const uint8x16_t c_u8 = vld1q_u8(c); + const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); + a += a_stride; + b += b_stride; + c += 16; + abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); + abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); + } + return abs; } -static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo, - const uint16x8_t vec_hi) { - const uint32x4_t vec_l_lo = - vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); - const uint32x4_t vec_l_hi = - vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); - const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} -static INLINE 
unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) { - const uint32x4_t a = vpaddlq_u16(vec_16x8); - const uint64x2_t b = vpaddlq_u32(a); - const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), - vreinterpret_u32_u64(vget_high_u64(b))); - return vget_lane_u32(c, 0); -} +#define sad16xN(n) \ + uint32_t vpx_sad16x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad16x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad16x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } + +sad16xN(8); +sad16xN(16); +sad16xN(32); -unsigned int vpx_sad64x64_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - for (i = 0; i < 64; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - const uint8x16_t vec_src_32 = vld1q_u8(src + 32); - const uint8x16_t vec_src_48 = vld1q_u8(src + 48); - const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32); - const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), - vget_low_u8(vec_ref_32)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), - vget_high_u8(vec_ref_32)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), - vget_low_u8(vec_ref_48)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), - vget_high_u8(vec_ref_48)); - } - return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi); + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + a += a_stride; + b += b_stride; + abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); + } + return abs; } -unsigned int vpx_sad32x32_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint16x8_t sad32x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - - for (i = 0; i < 32; ++i) { - const uint8x16_t vec_src_00 = vld1q_u8(src); - const uint8x16_t vec_src_16 = vld1q_u8(src + 16); - 
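One design point the new kernels leave implicit: a single uint16x8_t accumulator is only safe while the worst-case sum fits a 16-bit lane. Back-of-the-envelope arithmetic (mine, not from the patch): each vabal_u8 adds one |a - b| <= 255 into every lane, so a 32-byte row contributes at most 4 * 255 per lane and 64 rows stay just under 65535, whereas a 64-byte row would overflow, which is presumably why sad64x above splits its sums into abs_0/abs_1.

/* Compile-time check of that headroom argument (C11). */
_Static_assert(4 * 255 * 64 <= 65535, "32 bytes/row over 64 rows fits one uint16 lane");
_Static_assert(8 * 255 * 64 > 65535, "64 bytes/row over 64 rows would overflow a lane");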
const uint8x16_t vec_ref_00 = vld1q_u8(ref); - const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16); - src += src_stride; - ref += ref_stride; - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), - vget_low_u8(vec_ref_00)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), - vget_high_u8(vec_ref_00)); - vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), - vget_low_u8(vec_ref_16)); - vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), - vget_high_u8(vec_ref_16)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); + uint16x8_t abs = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_lo = vld1q_u8(a); + const uint8x16_t a_hi = vld1q_u8(a + 16); + const uint8x16_t b_lo = vld1q_u8(b); + const uint8x16_t b_hi = vld1q_u8(b + 16); + const uint8x16_t c_lo = vld1q_u8(c); + const uint8x16_t c_hi = vld1q_u8(c + 16); + const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); + const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); + a += a_stride; + b += b_stride; + c += 32; + abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); + abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); + abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); + abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); + } + return abs; } -unsigned int vpx_sad16x16_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +#define sad32xN(n) \ + uint32_t vpx_sad32x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint16x8_t abs = sad32x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } \ + \ + uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint16x8_t abs = \ + sad32x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint16x8(abs), 0); \ + } + +sad32xN(16); +sad32xN(32); +sad32xN(64); + +static INLINE uint32x4_t sad64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const int height) { int i; - uint16x8_t vec_accum_lo = vdupq_n_u16(0); - uint16x8_t vec_accum_hi = vdupq_n_u16(0); - - for (i = 0; i < 16; ++i) { - const uint8x16_t vec_src = vld1q_u8(src); - const uint8x16_t vec_ref = vld1q_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum_lo = - vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref)); - vec_accum_hi = - vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref)); - } - return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi)); + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); + + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + a += a_stride; + b += b_stride; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); + 
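  // Assumed rationale for the two accumulators: each vabal_u8 adds at most
  // 255 to every uint16 lane, and each accumulator takes four such calls per
  // row. Over the 64 rows of a 64x64 block that is at most
  // 4 * 255 * 64 = 65280 < 65535, so abs_0 and abs_1 stay within 16 bits
  // until the widening add at the end; one accumulator for all eight calls
  // per row would overflow.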
abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); + } } -unsigned int vpx_sad8x8_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride) { +static INLINE uint32x4_t sad64x_avg(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + const uint8_t *c, const int height) { int i; - uint16x8_t vec_accum = vdupq_n_u16(0); + uint16x8_t abs_0 = vdupq_n_u16(0); + uint16x8_t abs_1 = vdupq_n_u16(0); - for (i = 0; i < 8; ++i) { - const uint8x8_t vec_src = vld1_u8(src); - const uint8x8_t vec_ref = vld1_u8(ref); - src += src_stride; - ref += ref_stride; - vec_accum = vabal_u8(vec_accum, vec_src, vec_ref); + for (i = 0; i < height; ++i) { + const uint8x16_t a_0 = vld1q_u8(a); + const uint8x16_t a_1 = vld1q_u8(a + 16); + const uint8x16_t a_2 = vld1q_u8(a + 32); + const uint8x16_t a_3 = vld1q_u8(a + 48); + const uint8x16_t b_0 = vld1q_u8(b); + const uint8x16_t b_1 = vld1q_u8(b + 16); + const uint8x16_t b_2 = vld1q_u8(b + 32); + const uint8x16_t b_3 = vld1q_u8(b + 48); + const uint8x16_t c_0 = vld1q_u8(c); + const uint8x16_t c_1 = vld1q_u8(c + 16); + const uint8x16_t c_2 = vld1q_u8(c + 32); + const uint8x16_t c_3 = vld1q_u8(c + 48); + const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); + const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); + const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); + const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); + a += a_stride; + b += b_stride; + c += 64; + abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); + abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); + abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); + abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); + abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); + } + + { + const uint32x4_t sum = vpaddlq_u16(abs_0); + return vpadalq_u16(sum, abs_1); } - return horizontal_add_16x8(vec_accum); } + +#define sad64xN(n) \ + uint32_t vpx_sad64x##n##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + const uint32x4_t abs = sad64x(src, src_stride, ref, ref_stride, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } \ + \ + uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + const uint32x4_t abs = \ + sad64x_avg(src, src_stride, ref, ref_stride, second_pred, n); \ + return vget_lane_u32(horizontal_add_uint32x4(abs), 0); \ + } + +sad64xN(32); +sad64xN(64); diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c index 9b1622ff0..4f58a7832 100644 --- a/libvpx/vpx_dsp/arm/subpel_variance_neon.c +++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c @@ -12,16 +12,39 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_config.h" -#include "vpx_ports/mem.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/variance.h" +#include "vpx_dsp/arm/mem_neon.h" static const uint8_t bilinear_filters[8][2] = { { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, }; +// Process a block exactly 4 wide and a multiple of 2 high. 
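The helper that follows pairs rows up with load_unaligned_u8 so each NEON iteration filters eight pixels (two 4-wide rows), which is why the loop steps by 2 and advances output_ptr by 8. As a scalar reference, each output pixel is the two-tap rounding blend below, assuming FILTER_BITS == 7 (each bilinear_filters row sums to 128):

  static uint8_t bilinear_tap(uint8_t s0, uint8_t s1, const uint8_t f[2]) {
    /* Matches vmull_u8 + vmlal_u8 + vrshrn_n_u16: widen, blend, round,
     * narrow. (1 << 6) is the rounding bias for a shift by 7. */
    return (uint8_t)((s0 * f[0] + s1 * f[1] + (1 << 6)) >> 7);
  }
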
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + const uint8_t *filter) { + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); + unsigned int i; + for (i = 0; i < output_height; i += 2) { + const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line); + const uint8x8_t src_1 = + load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line); + const uint16x8_t a = vmull_u8(src_0, f0); + const uint16x8_t b = vmlal_u8(a, src_1, f1); + const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); + vst1_u8(output_ptr, out); + src_ptr += 2 * src_pixels_per_line; + output_ptr += 8; + } +} + // Process a block exactly 8 wide and any height. static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, @@ -29,8 +52,8 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, int pixel_step, unsigned int output_height, const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); @@ -38,8 +61,7 @@ static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, const uint16x8_t a = vmull_u8(src_0, f0); const uint16x8_t b = vmlal_u8(a, src_1, f1); const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); - vst1_u8(&output_ptr[0], out); - // Next row... + vst1_u8(output_ptr, out); src_ptr += src_pixels_per_line; output_ptr += 8; } @@ -53,8 +75,8 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, unsigned int output_height, unsigned int output_width, const uint8_t *filter) { - const uint8x8_t f0 = vmov_n_u8(filter[0]); - const uint8x8_t f1 = vmov_n_u8(filter[1]); + const uint8x8_t f0 = vdup_n_u8(filter[0]); + const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i, j; for (i = 0; i < output_height; ++i) { for (j = 0; j < output_width; j += 16) { @@ -66,36 +88,43 @@ static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); - vst1q_u8(&output_ptr[j], vcombine_u8(out_lo, out_hi)); + vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi)); } - // Next row... src_ptr += src_pixels_per_line; output_ptr += output_width; } } -// TODO(johannkoenig): support 4xM block sizes. -#define sub_pixel_varianceNxM(n, m) \ - unsigned int vpx_sub_pixel_variance##n##x##m##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *dst, int dst_stride, unsigned int *sse) { \ - DECLARE_ALIGNED(16, uint8_t, fdata3[n * (m + 1)]); \ - DECLARE_ALIGNED(16, uint8_t, temp2[n * m]); \ - \ - if (n == 8) { \ - var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, (m + 1), \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w8(fdata3, temp2, n, n, m, \ - bilinear_filters[yoffset]); \ - } else { \ - var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, (m + 1), n, \ - bilinear_filters[xoffset]); \ - var_filter_block2d_bil_w16(fdata3, temp2, n, n, m, n, \ - bilinear_filters[yoffset]); \ - } \ - return vpx_variance##n##x##m(temp2, n, dst, dst_stride, sse); \ +// 4xM filter writes an extra row to fdata because it processes two rows at a +// time. 
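For reference, the dispatch macro below expands per block size at compile time; for (n, m) = (4, 4) the generated function is effectively the following (a sketch with comments added, not the literal preprocessor output):

  uint32_t vpx_sub_pixel_variance4x4_neon(const uint8_t *a, int a_stride,
                                          int xoffset, int yoffset,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
    uint8_t temp0[4 * 6]; /* m + 2 rows: the w4 filter emits rows in pairs. */
    uint8_t temp1[4 * 4];
    /* Horizontal pass (pixel_step 1), then vertical pass (pixel_step n). */
    var_filter_block2d_bil_w4(a, temp0, a_stride, 1, 6,
                              bilinear_filters[xoffset]);
    var_filter_block2d_bil_w4(temp0, temp1, 4, 4, 4,
                              bilinear_filters[yoffset]);
    return vpx_variance4x4(temp1, 4, b, b_stride, sse);
  }
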
+#define sub_pixel_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + return vpx_variance##n##x##m(temp1, n, b, b_stride, sse); \ } +sub_pixel_varianceNxM(4, 4); +sub_pixel_varianceNxM(4, 8); sub_pixel_varianceNxM(8, 4); sub_pixel_varianceNxM(8, 8); sub_pixel_varianceNxM(8, 16); @@ -107,3 +136,49 @@ sub_pixel_varianceNxM(32, 32); sub_pixel_varianceNxM(32, 64); sub_pixel_varianceNxM(64, 32); sub_pixel_varianceNxM(64, 64); + +// 4xM filter writes an extra row to fdata because it processes two rows at a +// time. +#define sub_pixel_avg_varianceNxM(n, m) \ + uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \ + uint8_t temp1[n * m]; \ + \ + if (n == 4) { \ + var_filter_block2d_bil_w4(a, temp0, a_stride, 1, (m + 2), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else if (n == 8) { \ + var_filter_block2d_bil_w8(a, temp0, a_stride, 1, (m + 1), \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \ + bilinear_filters[yoffset]); \ + } else { \ + var_filter_block2d_bil_w16(a, temp0, a_stride, 1, (m + 1), n, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \ + bilinear_filters[yoffset]); \ + } \ + \ + vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \ + \ + return vpx_variance##n##x##m(temp0, n, b, b_stride, sse); \ + } + +sub_pixel_avg_varianceNxM(4, 4); +sub_pixel_avg_varianceNxM(4, 8); +sub_pixel_avg_varianceNxM(8, 4); +sub_pixel_avg_varianceNxM(8, 8); +sub_pixel_avg_varianceNxM(8, 16); +sub_pixel_avg_varianceNxM(16, 8); +sub_pixel_avg_varianceNxM(16, 16); +sub_pixel_avg_varianceNxM(16, 32); +sub_pixel_avg_varianceNxM(32, 16); +sub_pixel_avg_varianceNxM(32, 32); +sub_pixel_avg_varianceNxM(32, 64); +sub_pixel_avg_varianceNxM(64, 32); +sub_pixel_avg_varianceNxM(64, 64); diff --git a/libvpx/vpx_dsp/arm/sum_neon.h b/libvpx/vpx_dsp/arm/sum_neon.h new file mode 100644 index 000000000..d74fe0cde --- /dev/null +++ b/libvpx/vpx_dsp/arm/sum_neon.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_ARM_SUM_NEON_H_ +#define VPX_DSP_ARM_SUM_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +static INLINE int32x2_t horizontal_add_int16x8(const int16x8_t a) { + const int32x4_t b = vpaddlq_s16(a); + const int64x2_t c = vpaddlq_s32(b); + return vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)), + vreinterpret_s32_s64(vget_high_s64(c))); +} + +static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { + const uint32x4_t b = vpaddlq_u16(a); + const uint64x2_t c = vpaddlq_u32(b); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)), + vreinterpret_u32_u64(vget_high_u64(c))); +} + +static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a, + const uint16x8_t b) { + const uint32x4_t c = vpaddlq_u16(a); + const uint32x4_t d = vpadalq_u16(c, b); + const uint64x2_t e = vpaddlq_u32(d); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)), + vreinterpret_u32_u64(vget_high_u64(e))); +} + +static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { + const uint64x2_t b = vpaddlq_u32(a); + return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); +} +#endif // VPX_DSP_ARM_SUM_NEON_H_ diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c index a6b2c53b7..61c2c16a7 100644 --- a/libvpx/vpx_dsp/arm/variance_neon.c +++ b/libvpx/vpx_dsp/arm/variance_neon.c @@ -16,23 +16,9 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { - const int32x4_t a = vpaddlq_s16(v_16x8); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - -static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { - const int64x2_t b = vpaddlq_s32(v_32x4); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - return vget_lane_s32(c, 0); -} - // The variance helper functions use int16_t for sum. 8 values are accumulated // and then added (at which point they expand up to int32_t). To avoid overflow, // there can be no more than 32767 / 255 ~= 128 values accumulated in each @@ -79,8 +65,10 @@ static void variance_neon_w4x4(const uint8_t *a, int a_stride, const uint8_t *b, b += 4 * b_stride; } - *sum = horizontal_add_s16x8(sum_s16); - *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } // Process a block of any size where the width is divisible by 16. @@ -126,8 +114,10 @@ static void variance_neon_w16(const uint8_t *a, int a_stride, const uint8_t *b, b += b_stride; } - *sum = horizontal_add_s16x8(sum_s16); - *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } // Process a block of width 8 two rows at a time. 
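The sum_neon.h helpers above replace the per-file reductions; each folds a whole vector into lane 0 of the returned pair, so callers finish with vget_lane_u32(..., 0). As a scalar cross-check, horizontal_add_uint16x8 computes:

  static uint32_t horizontal_add_u16x8_ref(const uint16_t v[8]) {
    /* vpaddlq_u16 -> vpaddlq_u32 -> vadd_u32, flattened to one loop. */
    uint32_t sum = 0;
    int i;
    for (i = 0; i < 8; ++i) sum += v[i];
    return sum; /* lane 0 of the returned uint32x2_t; lane 1 is zero. */
  }
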
@@ -165,8 +155,10 @@ static void variance_neon_w8x2(const uint8_t *a, int a_stride, const uint8_t *b, i += 2; } while (i < h); - *sum = horizontal_add_s16x8(sum_s16); - *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_lo_s32, sse_hi_s32)); + *sum = vget_lane_s32(horizontal_add_int16x8(sum_s16), 0); + *sse = vget_lane_u32(horizontal_add_uint32x4(vreinterpretq_u32_s32( + vaddq_s32(sse_lo_s32, sse_hi_s32))), + 0); } void vpx_get8x8var_neon(const uint8_t *a, int a_stride, const uint8_t *b, diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm index e279d570f..1c2ee5063 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -127,7 +128,7 @@ vpx_convolve8_avg_loop_horiz sub r2, r2, r3, lsl #2 ; reset for store - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -184,11 +185,13 @@ vpx_convolve8_avg_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -232,7 +235,7 @@ vpx_convolve8_avg_loop_vert pld [r7] pld [r4] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r7, r1] diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index 1386838ee..08ae17dba 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" #include "vpx_ports/mem.h" // Note: @@ -29,43 +30,11 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. 
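The convolve entry points in this file also move to the kernel-table prototype: InterpKernel is libvpx's 8-tap row type (typedef int16_t InterpKernel[8] in vpx_dsp/vpx_filter.h), and x0_q4/y0_q4 index the 16-phase table, which is why the taps now load as vld1q_s16(filter[x0_q4]). A minimal sketch of the selection:

  typedef int16_t InterpKernel[8]; /* mirrors the libvpx typedef */

  /* vld1q_s16(filter[x0_q4]) reads the 8 taps of one phase: */
  static const int16_t *select_phase(const InterpKernel *filter, int q4) {
    return filter[q4]; /* row q4 (0..15) of the kernel table */
  }
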
-static INLINE void load_8x4(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0, - uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); -} - -static INLINE void load_8x8(const uint8_t *s, ptrdiff_t p, uint8x8_t *s0, - uint8x8_t *s1, uint8x8_t *s2, uint8x8_t *s3, - uint8x8_t *s4, uint8x8_t *s5, uint8x8_t *s6, - uint8x8_t *s7) { - *s0 = vld1_u8(s); - s += p; - *s1 = vld1_u8(s); - s += p; - *s2 = vld1_u8(s); - s += p; - *s3 = vld1_u8(s); - s += p; - *s4 = vld1_u8(s); - s += p; - *s5 = vld1_u8(s); - s += p; - *s6 = vld1_u8(s); - s += p; - *s7 = vld1_u8(s); -} - -static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, - const uint8x8_t s1, const uint8x8_t s2, - const uint8x8_t s3, const uint8x8_t s4, - const uint8x8_t s5, const uint8x8_t s6, - const uint8x8_t s7) { +static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2, const uint8x8_t s3, + const uint8x8_t s4, const uint8x8_t s5, + const uint8x8_t s6, const uint8x8_t s7) { vst1_u8(s, s0); s += p; vst1_u8(s, s1); @@ -83,53 +52,12 @@ static INLINE void store_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, vst1_u8(s, s7); } -static INLINE int16x4_t convolve8_4(int16x4_t s0, int16x4_t s1, int16x4_t s2, - int16x4_t s3, int16x4_t s4, int16x4_t s5, - int16x4_t s6, int16x4_t s7, - int16x8_t filters, int16x4_t filter3, - int16x4_t filter4) { - const int16x4_t filters_lo = vget_low_s16(filters); - const int16x4_t filters_hi = vget_high_s16(filters); - int16x4_t sum = vdup_n_s16(0); - - sum = vmla_lane_s16(sum, s0, filters_lo, 0); - sum = vmla_lane_s16(sum, s1, filters_lo, 1); - sum = vmla_lane_s16(sum, s2, filters_lo, 2); - sum = vmla_lane_s16(sum, s5, filters_hi, 1); - sum = vmla_lane_s16(sum, s6, filters_hi, 2); - sum = vmla_lane_s16(sum, s7, filters_hi, 3); - sum = vqadd_s16(sum, vmul_s16(s3, filter3)); - sum = vqadd_s16(sum, vmul_s16(s4, filter4)); - return sum; -} - -static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2, - int16x8_t s3, int16x8_t s4, int16x8_t s5, - int16x8_t s6, int16x8_t s7, - int16x8_t filters, int16x8_t filter3, - int16x8_t filter4) { - const int16x4_t filters_lo = vget_low_s16(filters); - const int16x4_t filters_hi = vget_high_s16(filters); - int16x8_t sum = vdupq_n_s16(0); - - sum = vmlaq_lane_s16(sum, s0, filters_lo, 0); - sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); - sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); - sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); - sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); - sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); - sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); - sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); - return sum; -} - void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - const int16x8_t filters = vld1q_s16(filter_x); + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); @@ -137,8 +65,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); (void)x_step_q4; + (void)y0_q4; (void)y_step_q4; - (void)filter_y; src -= 3; @@ -154,7 +82,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, 
ptrdiff_t src_stride, __builtin_prefetch(src + 3 * src_stride); filter3 = vdup_lane_s16(vget_low_s16(filters), 3); filter4 = vdup_lane_s16(vget_high_s16(filters), 0); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -174,7 +102,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 7; do { - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -224,11 +152,11 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { do { - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -238,7 +166,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); src += 8 * src_stride; __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -248,7 +177,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(dst + 5 * dst_stride); __builtin_prefetch(dst + 6 * dst_stride); __builtin_prefetch(dst + 7 * dst_stride); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); @@ -262,19 +191,15 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); transpose_u8_8x4(&t0, &t1, &t2, &t3); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); dst += dst_stride; @@ -296,7 +221,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } while (h > 0); } else { uint8_t *d; - int16x8_t s11, s12, s13, s14, d4, d5, d6, d7; + int16x8_t s11, s12, s13, s14; do { __builtin_prefetch(src + 0 * src_stride); @@ -307,7 +232,7 @@ void 
vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -330,7 +255,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(dst + 7 * dst_stride); do { - load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -341,33 +266,25 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, filter4); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, filter4); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, filter4); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, filter3, filter4); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); - t4 = vqrshrun_n_s16(d4, 7); - t5 = vqrshrun_n_s16(d5, 7); - t6 = vqrshrun_n_s16(d6, 7); - t7 = vqrshrun_n_s16(d7, 7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - store_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); s0 = s8; s1 = s9; @@ -390,11 +307,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); @@ -402,8 +318,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); (void)x_step_q4; + (void)y0_q4; (void)y_step_q4; - (void)filter_y; src -= 3; @@ -420,7 +336,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t 
*src, ptrdiff_t src_stride, __builtin_prefetch(src + 3 * src_stride); filter3 = vdup_lane_s16(vget_low_s16(filters), 3); filter4 = vdup_lane_s16(vget_high_s16(filters), 0); - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -440,7 +356,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 7; do { - load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -493,13 +409,13 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int width; const uint8_t *s; uint8x8_t t4, t5, t6, t7; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { uint32x4_t d0415 = vdupq_n_u32(0); uint32x4_t d2637 = vdupq_n_u32(0); do { - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -509,7 +425,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); - load_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, + &t7); src += 8 * src_stride; __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -519,7 +436,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(dst + 5 * dst_stride); __builtin_prefetch(dst + 6 * dst_stride); __builtin_prefetch(dst + 7 * dst_stride); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_4x8(&t0, &t1, &t2, &t3, t4, t5, t6, t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); @@ -533,19 +450,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); transpose_u8_8x4(&t0, &t1, &t2, &t3); d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); @@ -581,7 +494,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } while (h > 0); } else { uint8_t *d; - int16x8_t s11, s12, s13, s14, d4, d5, d6, d7; + int16x8_t s11, s12, s13, s14; 
uint8x16_t d01, d23, d45, d67; do { @@ -593,7 +506,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - load_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -616,7 +529,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(dst + 7 * dst_stride); do { - load_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); @@ -627,31 +540,23 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, + t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3, filter4); - d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, + t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3, filter4); - d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, + t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3, filter4); - d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, + t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, filter3, filter4); - t0 = vqrshrun_n_s16(d0, 7); - t1 = vqrshrun_n_s16(d1, 7); - t2 = vqrshrun_n_s16(d2, 7); - t3 = vqrshrun_n_s16(d3, 7); - t4 = vqrshrun_n_s16(d4, 7); - t5 = vqrshrun_n_s16(d5, 7); - t6 = vqrshrun_n_s16(d6, 7); - t7 = vqrshrun_n_s16(d7, 7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), @@ -667,9 +572,9 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); - store_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), - vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), - vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), + vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), + vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); s0 = s8; s1 = s9; @@ -692,19 +597,18 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t 
*filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); assert(y_step_q4 == 16); + (void)x0_q4; (void)x_step_q4; (void)y_step_q4; - (void)filter_x; src -= 3 * src_stride; @@ -782,7 +686,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int height; const uint8_t *s; uint8_t *d; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + uint8x8_t t0, t1, t2, t3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { __builtin_prefetch(src + 0 * src_stride); @@ -828,22 +733,22 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - vst1_u8(d, vqrshrun_n_s16(d0, 7)); + vst1_u8(d, t0); d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d1, 7)); + vst1_u8(d, t1); d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d2, 7)); + vst1_u8(d, t2); d += dst_stride; - vst1_u8(d, vqrshrun_n_s16(d3, 7)); + vst1_u8(d, t3); d += dst_stride; s0 = s4; @@ -864,19 +769,18 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); assert(y_step_q4 == 16); + (void)x0_q4; (void)x_step_q4; (void)y_step_q4; - (void)filter_x; src -= 3 * src_stride; @@ -963,8 +867,9 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int height; const uint8_t *s; uint8_t *d; + uint8x8_t t0, t1, t2, t3; uint8x16_t d01, d23, dd01, dd23; - int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { __builtin_prefetch(src + 0 * src_stride); @@ -1010,17 +915,17 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); - d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, filter4); - d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3, + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, 
filters, filter3, filter4); - d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3, filter4); - d01 = vcombine_u8(vqrshrun_n_s16(d0, 7), vqrshrun_n_s16(d1, 7)); - d23 = vcombine_u8(vqrshrun_n_s16(d2, 7), vqrshrun_n_s16(d3, 7)); + d01 = vcombine_u8(t0, t1); + d23 = vcombine_u8(t2, t3); dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), vld1_u8(d + 1 * dst_stride)); dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h new file mode 100644 index 000000000..c1634ed55 --- /dev/null +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); +} + +static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p, + uint8x8_t *const s0, uint8x8_t *const s1, + uint8x8_t *const s2, uint8x8_t *const s3, + uint8x8_t *const s4, uint8x8_t *const s5, + uint8x8_t *const s6, uint8x8_t *const s7) { + *s0 = vld1_u8(s); + s += p; + *s1 = vld1_u8(s); + s += p; + *s2 = vld1_u8(s); + s += p; + *s3 = vld1_u8(s); + s += p; + *s4 = vld1_u8(s); + s += p; + *s5 = vld1_u8(s); + s += p; + *s6 = vld1_u8(s); + s += p; + *s7 = vld1_u8(s); +} + +static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2, uint8x16_t *const s3, + uint8x16_t *const s4, uint8x16_t *const s5, + uint8x16_t *const s6, uint8x16_t *const s7) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); + s += p; + *s3 = vld1q_u8(s); + s += p; + *s4 = vld1q_u8(s); + s += p; + *s5 = vld1q_u8(s); + s += p; + *s6 = vld1q_u8(s); + s += p; + *s7 = vld1q_u8(s); +} + +static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters, + const int16x4_t filter3, + const int16x4_t filter4) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x4_t sum; + + sum = vmul_lane_s16(s0, filters_lo, 0); + sum = vmla_lane_s16(sum, s1, filters_lo, 1); + sum = vmla_lane_s16(sum, s2, filters_lo, 2); + sum = vmla_lane_s16(sum, s5, filters_hi, 1); + sum = vmla_lane_s16(sum, s6, filters_hi, 2); + sum = vmla_lane_s16(sum, s7, filters_hi, 3); + sum = vqadd_s16(sum, vmul_s16(s3, filter3)); + sum = vqadd_s16(sum, vmul_s16(s4, filter4)); + return sum; +} + +static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, + const int16x8_t filter3, + const int16x8_t 
filter4) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int16x8_t sum; + + sum = vmulq_lane_s16(s0, filters_lo, 0); + sum = vmlaq_lane_s16(sum, s1, filters_lo, 1); + sum = vmlaq_lane_s16(sum, s2, filters_lo, 2); + sum = vmlaq_lane_s16(sum, s5, filters_hi, 1); + sum = vmlaq_lane_s16(sum, s6, filters_hi, 2); + sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); + sum = vqaddq_s16(sum, vmulq_s16(s3, filter3)); + sum = vqaddq_s16(sum, vmulq_s16(s4, filter4)); + return vqrshrun_n_s16(sum, 7); +} + +static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, + const int16x8_t filters) { + const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3); + const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0); + int16x8_t ss[8]; + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + ss[4] = vreinterpretq_s16_u16(vmovl_u8(s[4])); + ss[5] = vreinterpretq_s16_u16(vmovl_u8(s[5])); + ss[6] = vreinterpretq_s16_u16(vmovl_u8(s[6])); + ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7])); + + return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7], + filters, filter3, filter4); +} diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm index 2d0f2ae06..5eee15664 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm +++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -119,7 +120,7 @@ vpx_convolve8_loop_horiz pld [r5, r1, lsl #1] - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -173,11 +174,13 @@ vpx_convolve8_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -216,7 +219,7 @@ vpx_convolve8_loop_vert pld [r5] pld [r8] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r5, r3] diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c index 04cb835fa..07349d03a 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -15,13 +15,13 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int 
filter_y_stride, int w, - int h) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; if (w < 8) { // avg4 uint8x8_t s0, s1; diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm index 97e6189fd..efd6574f1 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm +++ b/libvpx/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_avg_neon| PROC push {r4-r6, lr} - ldrd r4, r5, [sp, #32] + ldrd r4, r5, [sp, #36] mov r6, r2 cmp r4, #32 diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c index a8f690acd..7abed67a4 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -15,13 +15,14 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; if (w < 8) { // copy4 do { diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm index 89164ad48..7a66e3ce2 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm +++ b/libvpx/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_copy_neon| PROC push {r4-r5, lr} - ldrd r4, r5, [sp, #28] + ldrd r4, r5, [sp, #32] cmp r4, #32 bgt copy64 diff --git a/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index 6ca0e501b..2bf2d890b 100644 --- a/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -15,13 +15,13 @@ #include "vpx_ports/mem.h" void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). */ - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + uint8_t temp[64 * 72]; // Account for the vertical phase needing 3 lines prior and 4 lines post const int intermediate_height = h + 7; @@ -33,21 +33,21 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, * height and filter a multiple of 4 lines. Since this goes in to the temp * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. 
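 * (Checking the headroom: intermediate_height = h + 7 <= 71 for h <= 64,
 * and rounding 71 up to a multiple of 4 gives 72, which is the row count
 * temp[64 * 72] is sized for.)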
*/ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, - x_step_q4, filter_y, y_step_q4, w, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); + uint8_t temp[64 * 72]; const int intermediate_height = h + 7; assert(y_step_q4 == 16); @@ -56,9 +56,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, - x_step_q4, filter_y, y_step_q4, w, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } diff --git a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c new file mode 100644 index 000000000..8edf8a66e --- /dev/null +++ b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c @@ -0,0 +1,324 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> +#include <string.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void scaledconvolve_horiz_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int x, y, z; + + src -= SUBPEL_TAPS / 2 - 1; + + y = h; + do { + int x_q4 = x0_q4; + x = 0; + do { + // process 4 src_x steps + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t s[8], d; + int16x8_t ss[4]; + int16x4_t t[8], tt; + + load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]); + transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]); + + ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0])); + ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1])); + ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2])); + ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3])); + t[0] = vget_low_s16(ss[0]); + t[1] = vget_low_s16(ss[1]); + t[2] = vget_low_s16(ss[2]); + t[3] = vget_low_s16(ss[3]); + t[4] = vget_high_s16(ss[0]); + t[5] = vget_high_s16(ss[1]); + t[6] = vget_high_s16(ss[2]); + t[7] = vget_high_s16(ss[3]); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], + filters, filter3, filter4); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0); + } else { + int i; + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 4x4 filters values back to dst + { + const uint8x8x4_t d4 = vld4_u8(temp); + vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride], + vreinterpret_u32_u8(d4.val[0]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride], + vreinterpret_u32_u8(d4.val[1]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride], + vreinterpret_u32_u8(d4.val[2]), 0); + vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride], + vreinterpret_u32_u8(d4.val[3]), 0); + } + x += 4; + } while (x < w); + + src += src_stride * 4; + dst += dst_stride * 4; + y -= 4; + } while (y > 0); +} + +static INLINE void scaledconvolve_horiz_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, const int w, const int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int x, y, z; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. 
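  // e.g. h = 13 rounds up to (13 + 7) & ~7 = 16, keeping the 8-row tiling
  // below uniform.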
+ y = (h + 7) & ~7; + + do { + int x_q4 = x0_q4; + x = 0; + do { + uint8x8_t d[8]; + // process 8 src_x steps + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + + if (x_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]); + uint8x8_t s[8]; + load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], + &s[5], &s[6], &s[7]); + transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6], + &s[7]); + d[0] = scale_filter_8(s, filters); + vst1_u8(&temp[8 * z], d[0]); + } else { + int i; + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[i * src_stride + 3]; + } + } + x_q4 += x_step_q4; + } + + // transpose the 8x8 filters values back to dst + load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], + &d[7]); + transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]); + vst1_u8(&dst[x + 0 * dst_stride], d[0]); + vst1_u8(&dst[x + 1 * dst_stride], d[1]); + vst1_u8(&dst[x + 2 * dst_stride], d[2]); + vst1_u8(&dst[x + 3 * dst_stride], d[3]); + vst1_u8(&dst[x + 4 * dst_stride], d[4]); + vst1_u8(&dst[x + 5 * dst_stride], d[5]); + vst1_u8(&dst[x + 6 * dst_stride], d[6]); + vst1_u8(&dst[x + 7 * dst_stride], d[7]); + x += 8; + } while (x < w); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static INLINE void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3); + const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0); + uint8x8_t s[8], d; + int16x4_t t[8], tt; + + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + &s[6], &s[7]); + t[0] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[0]))); + t[1] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[1]))); + t[2] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[2]))); + t[3] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[3]))); + t[4] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[4]))); + t[5] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[5]))); + t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6]))); + t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7]))); + + tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters, + filter3, filter4); + d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x8_t s[8], d; + load_u8_8x8(src_y, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4], &s[5], + 
&s[6], &s[7]); + d = scale_filter_8(s, filters); + vst1_u8(dst, d); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +static INLINE void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { + int x, y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + y = h; + do { + const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + if (y_q4 & SUBPEL_MASK) { + x = 0; + do { + const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]); + uint8x16_t ss[8]; + uint8x8_t s[8], d[2]; + load_u8_16x8(src_y, src_stride, &ss[0], &ss[1], &ss[2], &ss[3], &ss[4], + &ss[5], &ss[6], &ss[7]); + s[0] = vget_low_u8(ss[0]); + s[1] = vget_low_u8(ss[1]); + s[2] = vget_low_u8(ss[2]); + s[3] = vget_low_u8(ss[3]); + s[4] = vget_low_u8(ss[4]); + s[5] = vget_low_u8(ss[5]); + s[6] = vget_low_u8(ss[6]); + s[7] = vget_low_u8(ss[7]); + d[0] = scale_filter_8(s, filters); + + s[0] = vget_high_u8(ss[0]); + s[1] = vget_high_u8(ss[1]); + s[2] = vget_high_u8(ss[2]); + s[3] = vget_high_u8(ss[3]); + s[4] = vget_high_u8(ss[4]); + s[5] = vget_high_u8(ss[5]); + s[6] = vget_high_u8(ss[6]); + s[7] = vget_high_u8(ss[7]); + d[1] = scale_filter_8(s, filters); + vst1q_u8(&dst[x], vcombine_u8(d[0], d[1])); + src_y += 16; + x += 16; + } while (x < w); + } else { + memcpy(dst, &src_y[3 * src_stride], w); + } + dst += dst_stride; + y_q4 += y_step_q4; + } while (--y); +} + +void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
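The buffer-size derivation in the comment above can be checked with plain arithmetic. In the worst normative case (h = 64, y_step_q4 = 32) with the largest sub-pel offset y0_q4 = 15, the intermediate_height expression evaluates to ((63 * 32 + 15) >> 4) + 8 = 134 rows, within the documented 135-row bound, and the 8-row tiling of the horizontal w8 pass rounds that up to 136 rows, within the (135 + 8)-row buffer declared next. A standalone check, using only constants copied from the code:

#include <assert.h>
#include <stdio.h>

int main(void) {
  const int h = 64, y_step_q4 = 32, y0_q4 = 15; /* worst case */
  const int subpel_bits = 4, subpel_taps = 8;
  const int ih = (((h - 1) * y_step_q4 + y0_q4) >> subpel_bits) + subpel_taps;
  const int ih_w8 = (ih + 7) & ~7; /* horiz_w8 works in 8-row tiles */
  printf("intermediate_height = %d, rounded for w8 = %d\n", ih, ih_w8);
  assert(ih <= 135);        /* the documented 135-row bound */
  assert(ih_w8 <= 135 + 8); /* fits temp[(135 + 8) * 64] */
  return 0;
}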
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if (w >= 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } +} diff --git a/libvpx/vpx_dsp/avg.c b/libvpx/vpx_dsp/avg.c index e4cd6cca7..a7ac6d953 100644 --- a/libvpx/vpx_dsp/avg.c +++ b/libvpx/vpx_dsp/avg.c @@ -34,7 +34,7 @@ unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) { // src_diff: first pass, 9 bit, dynamic range [-255, 255] // second pass, 12 bit, dynamic range [-2040, 2040] -static void hadamard_col8(const int16_t *src_diff, int src_stride, +static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff) { int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; @@ -66,7 +66,7 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride, // The order of the output coeff of the hadamard is not important. For // optimization purposes the final transpose may be skipped. 
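The note above about skipping the final transpose holds whenever the Hadamard output feeds a permutation-invariant reduction, such as a SATD-style sum of absolute coefficients; that this is how the callers consume it is my reading of the surrounding code, not something the diff states. The switch from int to ptrdiff_t for the stride also matches the natural type of pointer offsets and avoids implicit conversions on LP64 targets. A minimal sketch of such a reduction, with a hypothetical helper name:

#include <stdint.h>
#include <stdlib.h>

/* SATD-style reduction: any permutation of coeff, including leaving the
 * Hadamard output untransposed, gives the same sum. */
static int satd_ref(const int16_t *coeff, int n) {
  int i, sum = 0;
  for (i = 0; i < n; ++i) sum += abs(coeff[i]);
  return sum;
}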
-void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, +void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; int16_t buffer[64]; @@ -92,7 +92,7 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, } // In place 16x16 2D Hadamard transform -void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, +void vpx_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { diff --git a/libvpx/vpx_dsp/deblock.c b/libvpx/vpx_dsp/deblock.c index 3734ac251..235e85793 100644 --- a/libvpx/vpx_dsp/deblock.c +++ b/libvpx/vpx_dsp/deblock.c @@ -9,6 +9,7 @@ */ #include <assert.h> #include <stdlib.h> +#include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" DECLARE_PROTECTED(const int16_t vpx_rv[]) = { diff --git a/libvpx/vpx_dsp/fwd_txfm.c b/libvpx/vpx_dsp/fwd_txfm.c index aa5960109..6dcb3ba66 100644 --- a/libvpx/vpx_dsp/fwd_txfm.c +++ b/libvpx/vpx_dsp/fwd_txfm.c @@ -84,7 +84,7 @@ void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) { for (r = 0; r < 4; ++r) for (c = 0; c < 4; ++c) sum += input[r * stride + c]; - output[0] = sum << 1; + output[0] = sum * 2; } void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { diff --git a/libvpx/vpx_dsp/intrapred.c b/libvpx/vpx_dsp/intrapred.c index 9e2048ebf..400e632e9 100644 --- a/libvpx/vpx_dsp/intrapred.c +++ b/libvpx/vpx_dsp/intrapred.c @@ -489,30 +489,39 @@ static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { int r, c; + int size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; + for (c = 0; c < bs; ++c) { + dst[c] = AVG2(above[c], above[c + 1]); + dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); + } + for (r = 2, size = bs - 2; r < bs; r += 2, --size) { + memcpy(dst + (r + 0) * stride, dst + (r >> 1), size * sizeof(*dst)); + vpx_memset16(dst + (r + 0) * stride + size, above[bs - 1], bs - size); + memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), + size * sizeof(*dst)); + vpx_memset16(dst + (r + 1) * stride + size, above[bs - 1], bs - size); } } static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left, int bd) { - int r, c; + const uint16_t above_right = above[bs - 1]; + const uint16_t *const dst_row0 = dst; + int x, size; (void)left; (void)bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 - ? AVG3(above[r + c], above[r + c + 1], above[r + c + 2]) - : above[bs * 2 - 1]; - } + + for (x = 0; x < bs - 1; ++x) { + dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); + } + dst[bs - 1] = above_right; + dst += stride; + for (x = 1, size = bs - 2; x < bs; ++x, --size) { + memcpy(dst, dst_row0 + x, size * sizeof(*dst)); + vpx_memset16(dst + size, above_right, x + 1); dst += stride; } } diff --git a/libvpx/vpx_dsp/inv_txfm.c b/libvpx/vpx_dsp/inv_txfm.c index 29323d1b8..0194aa1e1 100644 --- a/libvpx/vpx_dsp/inv_txfm.c +++ b/libvpx/vpx_dsp/inv_txfm.c @@ -105,6 +105,7 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { return; } + // 32-bit result is enough for the following multiplications. 
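Two integer-width idioms recur in the hunks that follow, both connected to the "32-bit result is enough" comment above. In fwd_txfm.c, sum << 1 becomes sum * 2 because left-shifting a negative signed value is undefined behavior in C, while the multiplication is well-defined and compiles to the same shift. In the inverse transforms, tran_low_t inputs are narrowed to int16_t so each product provably fits in 32 bits; the high-bitdepth hunks later promote the other way, to tran_high_t, so 12-bit coefficients multiplied by 14-bit cosine constants cannot overflow 32-bit arithmetic. The sketch below shows both idioms in isolation; cospi_16_64 = 11585 (round(cos(pi/4) * 2^14)) and DCT_CONST_BITS = 14 are quoted from the libvpx headers from memory and should be treated as assumptions.

#include <stdint.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static int32_t scale_dc(int32_t sum) {
  /* "sum << 1" is undefined for negative sum; "sum * 2" is not, and any
   * reasonable compiler still emits a single shift. */
  return sum * 2;
}

static int32_t butterfly_even(int32_t in0, int32_t in2) {
  const int16_t cospi_16_64 = 11585;
  /* Narrowing asserts the 16-bit coefficient range of a valid stream: the
   * sum is at most 2^16 in magnitude and the constant is below 2^14, so
   * the product stays below 2^30 and fits in 32 bits. */
  const int32_t temp = ((int16_t)in0 + (int16_t)in2) * cospi_16_64;
  return ROUND_POWER_OF_TWO(temp, DCT_CONST_BITS);
}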
s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; @@ -130,16 +131,16 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { } void idct4_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step[4]; + int16_t step[4]; tran_high_t temp1, temp2; // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; + temp1 = ((int16_t)input[0] + (int16_t)input[2]) * cospi_16_64; + temp2 = ((int16_t)input[0] - (int16_t)input[2]) * cospi_16_64; step[0] = WRAPLOW(dct_const_round_shift(temp1)); step[1] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; + temp1 = (int16_t)input[1] * cospi_24_64 - (int16_t)input[3] * cospi_8_64; + temp2 = (int16_t)input[1] * cospi_8_64 + (int16_t)input[3] * cospi_24_64; step[2] = WRAPLOW(dct_const_round_shift(temp1)); step[3] = WRAPLOW(dct_const_round_shift(temp2)); @@ -177,7 +178,8 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -267,20 +269,20 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { } void idct8_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[8], step2[8]; + int16_t step1[8], step2[8]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0]; - step1[2] = input[4]; - step1[1] = input[2]; - step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + step1[0] = (int16_t)input[0]; + step1[2] = (int16_t)input[4]; + step1[1] = (int16_t)input[2]; + step1[3] = (int16_t)input[6]; + temp1 = (int16_t)input[1] * cospi_28_64 - (int16_t)input[7] * cospi_4_64; + temp2 = (int16_t)input[1] * cospi_4_64 + (int16_t)input[7] * cospi_28_64; step1[4] = WRAPLOW(dct_const_round_shift(temp1)); step1[7] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + temp1 = (int16_t)input[5] * cospi_12_64 - (int16_t)input[3] * cospi_20_64; + temp2 = (int16_t)input[5] * cospi_20_64 + (int16_t)input[3] * cospi_12_64; step1[5] = WRAPLOW(dct_const_round_shift(temp1)); step1[6] = WRAPLOW(dct_const_round_shift(temp2)); @@ -373,7 +375,8 @@ void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 5); @@ -552,26 +555,26 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { } void idct16_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[16], step2[16]; + int16_t step1[16], step2[16]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0 / 2]; - step1[1] = input[16 / 2]; - step1[2] = input[8 / 2]; - step1[3] = input[24 / 2]; - step1[4] = input[4 / 2]; - step1[5] = input[20 / 2]; - step1[6] = input[12 / 2]; - step1[7] = input[28 
/ 2]; - step1[8] = input[2 / 2]; - step1[9] = input[18 / 2]; - step1[10] = input[10 / 2]; - step1[11] = input[26 / 2]; - step1[12] = input[6 / 2]; - step1[13] = input[22 / 2]; - step1[14] = input[14 / 2]; - step1[15] = input[30 / 2]; + step1[0] = (int16_t)input[0 / 2]; + step1[1] = (int16_t)input[16 / 2]; + step1[2] = (int16_t)input[8 / 2]; + step1[3] = (int16_t)input[24 / 2]; + step1[4] = (int16_t)input[4 / 2]; + step1[5] = (int16_t)input[20 / 2]; + step1[6] = (int16_t)input[12 / 2]; + step1[7] = (int16_t)input[28 / 2]; + step1[8] = (int16_t)input[2 / 2]; + step1[9] = (int16_t)input[18 / 2]; + step1[10] = (int16_t)input[10 / 2]; + step1[11] = (int16_t)input[26 / 2]; + step1[12] = (int16_t)input[6 / 2]; + step1[13] = (int16_t)input[22 / 2]; + step1[14] = (int16_t)input[14 / 2]; + step1[15] = (int16_t)input[30 / 2]; // stage 2 step2[0] = step1[0]; @@ -796,7 +799,8 @@ void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -807,64 +811,64 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } void idct32_c(const tran_low_t *input, tran_low_t *output) { - tran_low_t step1[32], step2[32]; + int16_t step1[32], step2[32]; tran_high_t temp1, temp2; // stage 1 - step1[0] = input[0]; - step1[1] = input[16]; - step1[2] = input[8]; - step1[3] = input[24]; - step1[4] = input[4]; - step1[5] = input[20]; - step1[6] = input[12]; - step1[7] = input[28]; - step1[8] = input[2]; - step1[9] = input[18]; - step1[10] = input[10]; - step1[11] = input[26]; - step1[12] = input[6]; - step1[13] = input[22]; - step1[14] = input[14]; - step1[15] = input[30]; - - temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + step1[0] = (int16_t)input[0]; + step1[1] = (int16_t)input[16]; + step1[2] = (int16_t)input[8]; + step1[3] = (int16_t)input[24]; + step1[4] = (int16_t)input[4]; + step1[5] = (int16_t)input[20]; + step1[6] = (int16_t)input[12]; + step1[7] = (int16_t)input[28]; + step1[8] = (int16_t)input[2]; + step1[9] = (int16_t)input[18]; + step1[10] = (int16_t)input[10]; + step1[11] = (int16_t)input[26]; + step1[12] = (int16_t)input[6]; + step1[13] = (int16_t)input[22]; + step1[14] = (int16_t)input[14]; + step1[15] = (int16_t)input[30]; + + temp1 = (int16_t)input[1] * cospi_31_64 - (int16_t)input[31] * cospi_1_64; + temp2 = (int16_t)input[1] * cospi_1_64 + (int16_t)input[31] * cospi_31_64; step1[16] = WRAPLOW(dct_const_round_shift(temp1)); step1[31] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + temp1 = (int16_t)input[17] * cospi_15_64 - (int16_t)input[15] * cospi_17_64; + temp2 = (int16_t)input[17] * cospi_17_64 + (int16_t)input[15] * cospi_15_64; step1[17] = WRAPLOW(dct_const_round_shift(temp1)); step1[30] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + temp1 = (int16_t)input[9] * cospi_23_64 - (int16_t)input[23] * cospi_9_64; + temp2 = (int16_t)input[9] * cospi_9_64 + (int16_t)input[23] * cospi_23_64; step1[18] = 
WRAPLOW(dct_const_round_shift(temp1)); step1[29] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + temp1 = (int16_t)input[25] * cospi_7_64 - (int16_t)input[7] * cospi_25_64; + temp2 = (int16_t)input[25] * cospi_25_64 + (int16_t)input[7] * cospi_7_64; step1[19] = WRAPLOW(dct_const_round_shift(temp1)); step1[28] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + temp1 = (int16_t)input[5] * cospi_27_64 - (int16_t)input[27] * cospi_5_64; + temp2 = (int16_t)input[5] * cospi_5_64 + (int16_t)input[27] * cospi_27_64; step1[20] = WRAPLOW(dct_const_round_shift(temp1)); step1[27] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + temp1 = (int16_t)input[21] * cospi_11_64 - (int16_t)input[11] * cospi_21_64; + temp2 = (int16_t)input[21] * cospi_21_64 + (int16_t)input[11] * cospi_11_64; step1[21] = WRAPLOW(dct_const_round_shift(temp1)); step1[26] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + temp1 = (int16_t)input[13] * cospi_19_64 - (int16_t)input[19] * cospi_13_64; + temp2 = (int16_t)input[13] * cospi_13_64 + (int16_t)input[19] * cospi_19_64; step1[22] = WRAPLOW(dct_const_round_shift(temp1)); step1[25] = WRAPLOW(dct_const_round_shift(temp2)); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + temp1 = (int16_t)input[29] * cospi_3_64 - (int16_t)input[3] * cospi_29_64; + temp2 = (int16_t)input[29] * cospi_29_64 + (int16_t)input[3] * cospi_3_64; step1[23] = WRAPLOW(dct_const_round_shift(temp1)); step1[24] = WRAPLOW(dct_const_round_shift(temp2)); @@ -1259,7 +1263,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -1390,13 +1395,13 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { return; } - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; + s0 = (tran_high_t)sinpi_1_9 * x0; + s1 = (tran_high_t)sinpi_2_9 * x0; + s2 = (tran_high_t)sinpi_3_9 * x1; + s3 = (tran_high_t)sinpi_4_9 * x2; + s4 = (tran_high_t)sinpi_1_9 * x2; + s5 = (tran_high_t)sinpi_2_9 * x3; + s6 = (tran_high_t)sinpi_4_9 * x3; s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); s0 = s0 + s3 + s5; @@ -1428,12 +1433,14 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - temp1 = (input[0] + input[2]) * cospi_16_64; - temp2 = (input[0] - input[2]) * cospi_16_64; + temp1 = (input[0] + input[2]) * (tran_high_t)cospi_16_64; + temp2 = (input[0] - input[2]) * (tran_high_t)cospi_16_64; step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; - temp2 = input[1] * 
cospi_8_64 + input[3] * cospi_24_64; + temp1 = + input[1] * (tran_high_t)cospi_24_64 - input[3] * (tran_high_t)cospi_8_64; + temp2 = + input[1] * (tran_high_t)cospi_8_64 + input[3] * (tran_high_t)cospi_24_64; step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1473,10 +1480,11 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1514,14 +1522,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s0 = (tran_high_t)cospi_2_64 * x0 + (tran_high_t)cospi_30_64 * x1; + s1 = (tran_high_t)cospi_30_64 * x0 - (tran_high_t)cospi_2_64 * x1; + s2 = (tran_high_t)cospi_10_64 * x2 + (tran_high_t)cospi_22_64 * x3; + s3 = (tran_high_t)cospi_22_64 * x2 - (tran_high_t)cospi_10_64 * x3; + s4 = (tran_high_t)cospi_18_64 * x4 + (tran_high_t)cospi_14_64 * x5; + s5 = (tran_high_t)cospi_14_64 * x4 - (tran_high_t)cospi_18_64 * x5; + s6 = (tran_high_t)cospi_26_64 * x6 + (tran_high_t)cospi_6_64 * x7; + s7 = (tran_high_t)cospi_6_64 * x6 - (tran_high_t)cospi_26_64 * x7; x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); @@ -1537,10 +1545,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = (tran_high_t)cospi_8_64 * x4 + (tran_high_t)cospi_24_64 * x5; + s5 = (tran_high_t)cospi_24_64 * x4 - (tran_high_t)cospi_8_64 * x5; + s6 = (tran_high_t)(-cospi_24_64) * x6 + (tran_high_t)cospi_8_64 * x7; + s7 = (tran_high_t)cospi_8_64 * x6 + (tran_high_t)cospi_24_64 * x7; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); @@ -1552,10 +1560,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); // stage 3 - s2 = cospi_16_64 * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); + s2 = (tran_high_t)cospi_16_64 * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (x6 - x7); x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); @@ -1589,12 +1597,16 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = input[4]; step1[1] = input[2]; step1[3] = input[6]; - temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; - temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; + temp1 = + input[1] * 
(tran_high_t)cospi_28_64 - input[7] * (tran_high_t)cospi_4_64; + temp2 = + input[1] * (tran_high_t)cospi_4_64 + input[7] * (tran_high_t)cospi_28_64; step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; - temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; + temp1 = + input[5] * (tran_high_t)cospi_12_64 - input[3] * (tran_high_t)cospi_20_64; + temp2 = + input[5] * (tran_high_t)cospi_20_64 + input[3] * (tran_high_t)cospi_12_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1609,8 +1621,8 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 3 - odd half step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; @@ -1681,10 +1693,11 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -1728,22 +1741,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { } // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; + s0 = x0 * (tran_high_t)cospi_1_64 + x1 * (tran_high_t)cospi_31_64; + s1 = x0 * (tran_high_t)cospi_31_64 - x1 * (tran_high_t)cospi_1_64; + s2 = x2 * (tran_high_t)cospi_5_64 + x3 * (tran_high_t)cospi_27_64; + s3 = x2 * (tran_high_t)cospi_27_64 - x3 * (tran_high_t)cospi_5_64; + s4 = x4 * (tran_high_t)cospi_9_64 + x5 * (tran_high_t)cospi_23_64; + s5 = x4 * (tran_high_t)cospi_23_64 - x5 * (tran_high_t)cospi_9_64; + s6 = x6 * (tran_high_t)cospi_13_64 + x7 * (tran_high_t)cospi_19_64; + s7 = x6 * (tran_high_t)cospi_19_64 - x7 * (tran_high_t)cospi_13_64; + s8 = x8 * (tran_high_t)cospi_17_64 + x9 * (tran_high_t)cospi_15_64; + s9 = x8 * (tran_high_t)cospi_15_64 - x9 * (tran_high_t)cospi_17_64; + s10 = x10 * (tran_high_t)cospi_21_64 + x11 * (tran_high_t)cospi_11_64; + s11 = x10 * (tran_high_t)cospi_11_64 - x11 * (tran_high_t)cospi_21_64; + s12 = x12 * (tran_high_t)cospi_25_64 + x13 * (tran_high_t)cospi_7_64; + 
s13 = x12 * (tran_high_t)cospi_7_64 - x13 * (tran_high_t)cospi_25_64; + s14 = x14 * (tran_high_t)cospi_29_64 + x15 * (tran_high_t)cospi_3_64; + s15 = x14 * (tran_high_t)cospi_3_64 - x15 * (tran_high_t)cospi_29_64; x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); @@ -1771,14 +1784,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s5 = x5; s6 = x6; s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + s8 = x8 * (tran_high_t)cospi_4_64 + x9 * (tran_high_t)cospi_28_64; + s9 = x8 * (tran_high_t)cospi_28_64 - x9 * (tran_high_t)cospi_4_64; + s10 = x10 * (tran_high_t)cospi_20_64 + x11 * (tran_high_t)cospi_12_64; + s11 = x10 * (tran_high_t)cospi_12_64 - x11 * (tran_high_t)cospi_20_64; + s12 = -x12 * (tran_high_t)cospi_28_64 + x13 * (tran_high_t)cospi_4_64; + s13 = x12 * (tran_high_t)cospi_4_64 + x13 * (tran_high_t)cospi_28_64; + s14 = -x14 * (tran_high_t)cospi_12_64 + x15 * (tran_high_t)cospi_20_64; + s15 = x14 * (tran_high_t)cospi_20_64 + x15 * (tran_high_t)cospi_12_64; x0 = HIGHBD_WRAPLOW(s0 + s4, bd); x1 = HIGHBD_WRAPLOW(s1 + s5, bd); @@ -1802,18 +1815,18 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s1 = x1; s2 = x2; s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s4 = x4 * (tran_high_t)cospi_8_64 + x5 * (tran_high_t)cospi_24_64; + s5 = x4 * (tran_high_t)cospi_24_64 - x5 * (tran_high_t)cospi_8_64; + s6 = -x6 * (tran_high_t)cospi_24_64 + x7 * (tran_high_t)cospi_8_64; + s7 = x6 * (tran_high_t)cospi_8_64 + x7 * (tran_high_t)cospi_24_64; s8 = x8; s9 = x9; s10 = x10; s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + s12 = x12 * (tran_high_t)cospi_8_64 + x13 * (tran_high_t)cospi_24_64; + s13 = x12 * (tran_high_t)cospi_24_64 - x13 * (tran_high_t)cospi_8_64; + s14 = -x14 * (tran_high_t)cospi_24_64 + x15 * (tran_high_t)cospi_8_64; + s15 = x14 * (tran_high_t)cospi_8_64 + x15 * (tran_high_t)cospi_24_64; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); @@ -1833,14 +1846,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); + s2 = (tran_high_t)(-cospi_16_64) * (x2 + x3); + s3 = (tran_high_t)cospi_16_64 * (x2 - x3); + s6 = (tran_high_t)cospi_16_64 * (x6 + x7); + s7 = (tran_high_t)cospi_16_64 * (-x6 + x7); + s10 = (tran_high_t)cospi_16_64 * (x10 + x11); + s11 = (tran_high_t)cospi_16_64 * (-x10 + x11); + s14 = (tran_high_t)(-cospi_16_64) * (x14 + x15); + s15 = (tran_high_t)cospi_16_64 * (x14 - x15); x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); x3 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); @@ -1910,23 +1923,31 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1936,12 +1957,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -1955,12 +1980,14 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd); // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + 
temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); @@ -1970,12 +1997,16 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; @@ -1987,8 +2018,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; @@ -2013,12 +2044,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; @@ -2126,10 +2157,11 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; tran_high_t a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -2169,43 +2201,59 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[14] = input[14]; step1[15] = input[30]; - temp1 = 
input[1] * cospi_31_64 - input[31] * cospi_1_64; - temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; + temp1 = + input[1] * (tran_high_t)cospi_31_64 - input[31] * (tran_high_t)cospi_1_64; + temp2 = + input[1] * (tran_high_t)cospi_1_64 + input[31] * (tran_high_t)cospi_31_64; step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; - temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; + temp1 = input[17] * (tran_high_t)cospi_15_64 - + input[15] * (tran_high_t)cospi_17_64; + temp2 = input[17] * (tran_high_t)cospi_17_64 + + input[15] * (tran_high_t)cospi_15_64; step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; - temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; + temp1 = + input[9] * (tran_high_t)cospi_23_64 - input[23] * (tran_high_t)cospi_9_64; + temp2 = + input[9] * (tran_high_t)cospi_9_64 + input[23] * (tran_high_t)cospi_23_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; - temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; + temp1 = + input[25] * (tran_high_t)cospi_7_64 - input[7] * (tran_high_t)cospi_25_64; + temp2 = + input[25] * (tran_high_t)cospi_25_64 + input[7] * (tran_high_t)cospi_7_64; step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; - temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; + temp1 = + input[5] * (tran_high_t)cospi_27_64 - input[27] * (tran_high_t)cospi_5_64; + temp2 = + input[5] * (tran_high_t)cospi_5_64 + input[27] * (tran_high_t)cospi_27_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; - temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; + temp1 = input[21] * (tran_high_t)cospi_11_64 - + input[11] * (tran_high_t)cospi_21_64; + temp2 = input[21] * (tran_high_t)cospi_21_64 + + input[11] * (tran_high_t)cospi_11_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; - temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; + temp1 = input[13] * (tran_high_t)cospi_19_64 - + input[19] * (tran_high_t)cospi_13_64; + temp2 = input[13] * (tran_high_t)cospi_13_64 + + input[19] * (tran_high_t)cospi_19_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; - temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; + temp1 = + input[29] * (tran_high_t)cospi_3_64 - input[3] * (tran_high_t)cospi_29_64; + temp2 = + input[29] * (tran_high_t)cospi_29_64 + input[3] * (tran_high_t)cospi_3_64; step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2219,23 +2267,31 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[6] = step1[6]; step2[7] = step1[7]; - temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; - temp2 = 
step1[8] * cospi_2_64 + step1[15] * cospi_30_64; + temp1 = + step1[8] * (tran_high_t)cospi_30_64 - step1[15] * (tran_high_t)cospi_2_64; + temp2 = + step1[8] * (tran_high_t)cospi_2_64 + step1[15] * (tran_high_t)cospi_30_64; step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; - temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; + temp1 = step1[9] * (tran_high_t)cospi_14_64 - + step1[14] * (tran_high_t)cospi_18_64; + temp2 = step1[9] * (tran_high_t)cospi_18_64 + + step1[14] * (tran_high_t)cospi_14_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; - temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; + temp1 = step1[10] * (tran_high_t)cospi_22_64 - + step1[13] * (tran_high_t)cospi_10_64; + temp2 = step1[10] * (tran_high_t)cospi_10_64 + + step1[13] * (tran_high_t)cospi_22_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; - temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; + temp1 = step1[11] * (tran_high_t)cospi_6_64 - + step1[12] * (tran_high_t)cospi_26_64; + temp2 = step1[11] * (tran_high_t)cospi_26_64 + + step1[12] * (tran_high_t)cospi_6_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2262,12 +2318,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = step2[2]; step1[3] = step2[3]; - temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; - temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; + temp1 = + step2[4] * (tran_high_t)cospi_28_64 - step2[7] * (tran_high_t)cospi_4_64; + temp2 = + step2[4] * (tran_high_t)cospi_4_64 + step2[7] * (tran_high_t)cospi_28_64; step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; - temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; + temp1 = + step2[5] * (tran_high_t)cospi_12_64 - step2[6] * (tran_high_t)cospi_20_64; + temp2 = + step2[5] * (tran_high_t)cospi_20_64 + step2[6] * (tran_high_t)cospi_12_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); @@ -2282,22 +2342,30 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[16] = step2[16]; step1[31] = step2[31]; - temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; - temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; + temp1 = -step2[17] * (tran_high_t)cospi_4_64 + + step2[30] * (tran_high_t)cospi_28_64; + temp2 = step2[17] * (tran_high_t)cospi_28_64 + + step2[30] * (tran_high_t)cospi_4_64; step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; - temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; + temp1 = -step2[18] * (tran_high_t)cospi_28_64 - + step2[29] * (tran_high_t)cospi_4_64; + temp2 = -step2[18] * (tran_high_t)cospi_4_64 + + step2[29] * (tran_high_t)cospi_28_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = 
HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; - temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; - temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; + temp1 = -step2[21] * (tran_high_t)cospi_20_64 + + step2[26] * (tran_high_t)cospi_12_64; + temp2 = step2[21] * (tran_high_t)cospi_12_64 + + step2[26] * (tran_high_t)cospi_20_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; - temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; + temp1 = -step2[22] * (tran_high_t)cospi_12_64 - + step2[25] * (tran_high_t)cospi_20_64; + temp2 = -step2[22] * (tran_high_t)cospi_20_64 + + step2[25] * (tran_high_t)cospi_12_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[23] = step2[23]; @@ -2306,12 +2374,14 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[28] = step2[28]; // stage 4 - temp1 = (step1[0] + step1[1]) * cospi_16_64; - temp2 = (step1[0] - step1[1]) * cospi_16_64; + temp1 = (step1[0] + step1[1]) * (tran_high_t)cospi_16_64; + temp2 = (step1[0] - step1[1]) * (tran_high_t)cospi_16_64; step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; + temp1 = + step1[2] * (tran_high_t)cospi_24_64 - step1[3] * (tran_high_t)cospi_8_64; + temp2 = + step1[2] * (tran_high_t)cospi_8_64 + step1[3] * (tran_high_t)cospi_24_64; step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); @@ -2321,12 +2391,16 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[8] = step1[8]; step2[15] = step1[15]; - temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; - temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; + temp1 = -step1[9] * (tran_high_t)cospi_8_64 + + step1[14] * (tran_high_t)cospi_24_64; + temp2 = + step1[9] * (tran_high_t)cospi_24_64 + step1[14] * (tran_high_t)cospi_8_64; step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; - temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; + temp1 = -step1[10] * (tran_high_t)cospi_24_64 - + step1[13] * (tran_high_t)cospi_8_64; + temp2 = -step1[10] * (tran_high_t)cospi_8_64 + + step1[13] * (tran_high_t)cospi_24_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; @@ -2356,8 +2430,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd); step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd); step1[4] = step2[4]; - temp1 = (step2[6] - step2[5]) * cospi_16_64; - temp2 = (step2[5] + step2[6]) * cospi_16_64; + temp1 = (step2[6] - step2[5]) * (tran_high_t)cospi_16_64; + temp2 = (step2[5] + step2[6]) * (tran_high_t)cospi_16_64; step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; @@ -2373,20 +2447,28 @@ static void highbd_idct32_c(const tran_low_t 
*input, tran_low_t *output, step1[16] = step2[16]; step1[17] = step2[17]; - temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; - temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; + temp1 = -step2[18] * (tran_high_t)cospi_8_64 + + step2[29] * (tran_high_t)cospi_24_64; + temp2 = step2[18] * (tran_high_t)cospi_24_64 + + step2[29] * (tran_high_t)cospi_8_64; step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; - temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; + temp1 = -step2[19] * (tran_high_t)cospi_8_64 + + step2[28] * (tran_high_t)cospi_24_64; + temp2 = step2[19] * (tran_high_t)cospi_24_64 + + step2[28] * (tran_high_t)cospi_8_64; step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; - temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; + temp1 = -step2[20] * (tran_high_t)cospi_24_64 - + step2[27] * (tran_high_t)cospi_8_64; + temp2 = -step2[20] * (tran_high_t)cospi_8_64 + + step2[27] * (tran_high_t)cospi_24_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; - temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; + temp1 = -step2[21] * (tran_high_t)cospi_24_64 - + step2[26] * (tran_high_t)cospi_8_64; + temp2 = -step2[21] * (tran_high_t)cospi_8_64 + + step2[26] * (tran_high_t)cospi_24_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[22] = step2[22]; @@ -2407,12 +2489,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd); step2[8] = step1[8]; step2[9] = step1[9]; - temp1 = (-step1[10] + step1[13]) * cospi_16_64; - temp2 = (step1[10] + step1[13]) * cospi_16_64; + temp1 = (-step1[10] + step1[13]) * (tran_high_t)cospi_16_64; + temp2 = (step1[10] + step1[13]) * (tran_high_t)cospi_16_64; step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step1[11] + step1[12]) * cospi_16_64; - temp2 = (step1[11] + step1[12]) * cospi_16_64; + temp1 = (-step1[11] + step1[12]) * (tran_high_t)cospi_16_64; + temp2 = (step1[11] + step1[12]) * (tran_high_t)cospi_16_64; step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; @@ -2458,20 +2540,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[17] = step2[17]; step1[18] = step2[18]; step1[19] = step2[19]; - temp1 = (-step2[20] + step2[27]) * cospi_16_64; - temp2 = (step2[20] + step2[27]) * cospi_16_64; + temp1 = (-step2[20] + step2[27]) * (tran_high_t)cospi_16_64; + temp2 = (step2[20] + step2[27]) * (tran_high_t)cospi_16_64; step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[21] + step2[26]) * cospi_16_64; - temp2 = (step2[21] + step2[26]) * cospi_16_64; + temp1 = (-step2[21] + step2[26]) * (tran_high_t)cospi_16_64; + temp2 = (step2[21] + step2[26]) * (tran_high_t)cospi_16_64; step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[26] = 
HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[22] + step2[25]) * cospi_16_64; - temp2 = (step2[22] + step2[25]) * cospi_16_64; + temp1 = (-step2[22] + step2[25]) * (tran_high_t)cospi_16_64; + temp2 = (step2[22] + step2[25]) * (tran_high_t)cospi_16_64; step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); - temp1 = (-step2[23] + step2[24]) * cospi_16_64; - temp2 = (step2[23] + step2[24]) * cospi_16_64; + temp1 = (-step2[23] + step2[24]) * (tran_high_t)cospi_16_64; + temp2 = (step2[23] + step2[24]) * (tran_high_t)cospi_16_64; step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[28] = step2[28]; @@ -2603,10 +2685,11 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd) { int i, j; int a1; - tran_low_t out = - HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + tran_low_t out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { diff --git a/libvpx/vpx_dsp/mips/avg_msa.c b/libvpx/vpx_dsp/mips/avg_msa.c index 48b841969..d0ac7b8e2 100644 --- a/libvpx/vpx_dsp/mips/avg_msa.c +++ b/libvpx/vpx_dsp/mips/avg_msa.c @@ -56,7 +56,8 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { return sum_out; } -void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) { +void vpx_hadamard_8x8_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -80,7 +81,8 @@ void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) { ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); } -void vpx_hadamard_16x16_msa(const int16_t *src, int src_stride, int16_t *dst) { +void vpx_hadamard_16x16_msa(const int16_t *src, ptrdiff_t src_stride, + int16_t *dst) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; diff --git a/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c index ae88eddfd..18e7d5375 100644 --- a/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git 
a/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c index e944207b6..7dcb662d7 100644 --- a/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c index 5cc06b5f2..9e65a8f50 100644 --- a/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c b/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c index eb1975e44..a3e967b40 100644 --- a/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index b4ed6ee85..d9c2bef69 100644 --- a/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ 
b/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -334,15 +334,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -367,8 +368,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -376,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. */ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); @@ -390,24 +391,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, if (intermediate_height < h) intermediate_height = h; - vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, intermediate_height); + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); - vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { int x, y; uint32_t tp1, tp2, tn1, tp3, tp4, tn2; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index 9a9bab25a..fb68ad881 100644 --- a/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -938,15 +938,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const 
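/* Annotation (editor's, not part of the patch): each of these dspr2
 * convolve entry points used to receive pre-selected filter_x/filter_y
 * pointers; they now take the whole InterpKernel table plus the x0_q4 and
 * y0_q4 phase indices and select their own 8-tap row, as the added
 * "const int16_t *const filter_x = filter[x0_q4];" lines show. */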
InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -987,9 +988,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h); + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_dspr2.c index 8d35b6394..89f0f4196 100644 --- a/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; uint32_t pos = 38; @@ -1395,14 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c index 196a0a2f0..77e95c844 100644 --- a/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t 
src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c b/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c index ad107d5c4..c329f71cc 100644 --- a/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/libvpx/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/convolve_common_dspr2.h b/libvpx/vpx_dsp/mips/convolve_common_dspr2.h index 4eee3bd5e..48e440d73 100644 --- a/libvpx/vpx_dsp/mips/convolve_common_dspr2.h +++ b/libvpx/vpx_dsp/mips/convolve_common_dspr2.h @@ -24,21 +24,21 @@ extern "C" { #if HAVE_DSPR2 void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter, int w, @@ -46,9 +46,9 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); #endif // #if 
HAVE_DSPR2 #ifdef __cplusplus diff --git a/libvpx/vpx_dsp/mips/fwd_txfm_msa.c b/libvpx/vpx_dsp/mips/fwd_txfm_msa.c index f786664bb..5a6dfcef2 100644 --- a/libvpx/vpx_dsp/mips/fwd_txfm_msa.c +++ b/libvpx/vpx_dsp/mips/fwd_txfm_msa.c @@ -8,8 +8,23 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/fwd_txfm_msa.h" +void vpx_fdct8x8_1_msa(const int16_t *input, tran_low_t *out, int32_t stride) { + v8i16 in0, in1, in2, in3, in4, in5, in6, in7; + v4i32 vec_w; + + LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); + ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); + ADD2(in0, in2, in4, in6, in0, in4); + vec_w = __msa_hadd_s_w(in0, in0); + vec_w += __msa_hadd_s_w(in4, in4); + out[0] = HADD_SW_S32(vec_w); + out[1] = 0; +} + +#if !CONFIG_VP9_HIGHBITDEPTH void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; @@ -215,19 +230,6 @@ void vpx_fdct8x8_msa(const int16_t *input, int16_t *output, ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, output, 8); } -void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { - v8i16 in0, in1, in2, in3, in4, in5, in6, in7; - v4i32 vec_w; - - LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); - ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); - ADD2(in0, in2, in4, in6, in0, in4); - vec_w = __msa_hadd_s_w(in0, in0); - vec_w += __msa_hadd_s_w(in4, in4); - out[0] = HADD_SW_S32(vec_w); - out[1] = 0; -} - void vpx_fdct16x16_msa(const int16_t *input, int16_t *output, int32_t src_stride) { int32_t i; @@ -267,3 +269,4 @@ void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 1); } +#endif // !CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/libvpx/vpx_dsp/mips/itrans4_dspr2.c index 3f985b847..e214b538d 100644 --- a/libvpx/vpx_dsp/mips/itrans4_dspr2.c +++ b/libvpx/vpx_dsp/mips/itrans4_dspr2.c @@ -343,6 +343,7 @@ void iadst4_dspr2(const int16_t *input, int16_t *output) { return; } + // 32-bit result is enough for the following multiplications. s0 = sinpi_1_9 * x0; s1 = sinpi_2_9 * x0; s2 = sinpi_3_9 * x1; diff --git a/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/libvpx/vpx_dsp/mips/loopfilter_16_msa.c index b73d56bd5..b1731f234 100644 --- a/libvpx/vpx_dsp/mips/loopfilter_16_msa.c +++ b/libvpx/vpx_dsp/mips/loopfilter_16_msa.c @@ -8,13 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#include "vpx_ports/mem.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" +#include "vpx_ports/mem.h" -int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, + uint8_t *filter48, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -77,7 +79,7 @@ int32_t vpx_hz_lpf_t4_and_t8_16w(uint8_t *src, int32_t pitch, uint8_t *filter48, } } -void vpx_hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { +static void hz_lpf_t16_16w(uint8_t *src, int32_t pitch, uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -413,11 +415,11 @@ static void mb_lpf_horizontal_edge_dual(uint8_t *src, int32_t pitch, (void)count; - early_exit = vpx_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, - limit_ptr, thresh_ptr); + early_exit = hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0], b_limit_ptr, + limit_ptr, thresh_ptr); if (0 == early_exit) { - vpx_hz_lpf_t16_16w(src, pitch, filter48); + hz_lpf_t16_16w(src, pitch, filter48); } } @@ -753,11 +755,11 @@ static void transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch); } -int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch_org, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, int32_t pitch_org, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -820,8 +822,8 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16i8 zero = { 0 }; v16u8 filter8, flat, flat2; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1051,12 +1053,12 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16); early_exit = - vpx_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8), &filter48[0], src, pitch, + b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch); @@ -1064,11 +1066,11 @@ void vpx_lpf_vertical_16_msa(uint8_t *src, int32_t pitch, } } -int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, - uint8_t *src_org, int32_t pitch, - const uint8_t *b_limit_ptr, - const uint8_t *limit_ptr, - const uint8_t *thresh_ptr) { +static int32_t vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, + uint8_t *src_org, 
int32_t pitch, + const uint8_t *b_limit_ptr, + const uint8_t *limit_ptr, + const uint8_t *thresh_ptr) { v16u8 p3, p2, p1, p0, q3, q2, q1, q0; v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out; v16u8 flat, mask, hev, thresh, b_limit, limit; @@ -1141,8 +1143,8 @@ int32_t vpx_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, } } -int32_t vpx_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, - uint8_t *filter48) { +static int32_t vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, int32_t pitch, + uint8_t *filter48) { v16u8 flat, flat2, filter8; v16i8 zero = { 0 }; v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7; @@ -1473,12 +1475,12 @@ void vpx_lpf_vertical_16_dual_msa(uint8_t *src, int32_t pitch, transpose_16x16((src - 8), pitch, &transposed_input[0], 16); early_exit = - vpx_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, - pitch, b_limit_ptr, limit_ptr, thresh_ptr); + vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src, + pitch, b_limit_ptr, limit_ptr, thresh_ptr); if (0 == early_exit) { - early_exit = vpx_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, - &filter48[0]); + early_exit = + vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]); if (0 == early_exit) { transpose_16x16(transposed_input, 16, (src - 8), pitch); diff --git a/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/libvpx/vpx_dsp/mips/loopfilter_4_msa.c index 9500cd2fd..0eff2b6ca 100644 --- a/libvpx/vpx_dsp/mips/loopfilter_4_msa.c +++ b/libvpx/vpx_dsp/mips/loopfilter_4_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, diff --git a/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/libvpx/vpx_dsp/mips/loopfilter_8_msa.c index a22c62bb3..703fcce8a 100644 --- a/libvpx/vpx_dsp/mips/loopfilter_8_msa.c +++ b/libvpx/vpx_dsp/mips/loopfilter_8_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/loopfilter_msa.h" void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, diff --git a/libvpx/vpx_dsp/mips/macros_msa.h b/libvpx/vpx_dsp/mips/macros_msa.h index 27b38865a..f9a446e7b 100644 --- a/libvpx/vpx_dsp/mips/macros_msa.h +++ b/libvpx/vpx_dsp/mips/macros_msa.h @@ -16,207 +16,149 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" -#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) -#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) - -#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) -#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) - -#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc)) -#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) - -#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) -#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) - -#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) - -#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) -#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) +#define LD_V(RTYPE, psrc) *((const RTYPE *)(psrc)) +#define LD_UB(...) LD_V(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_V(v16i8, __VA_ARGS__) +#define LD_UH(...) LD_V(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_V(v8i16, __VA_ARGS__) +#define LD_SW(...) 
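/* Annotation (editor's, not part of the patch): LD_B/LD_H/LD_W (and the
 * matching ST_* stores) differed only in vector type, so this hunk folds
 * them into single LD_V/ST_V macros and re-points LD_UB, LD_SH, ST_SW and
 * friends at those. The (__mips_isa_rev >= 6) LH/LW/LD/SH/SW/SD variants
 * below likewise drop their inline asm in favor of plain dereferences,
 * presumably because r6 loads and stores handle misalignment natively. */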
LD_V(v4i32, __VA_ARGS__) + +#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_UB(...) ST_V(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_V(v16i8, __VA_ARGS__) +#define ST_SH(...) ST_V(v8i16, __VA_ARGS__) +#define ST_SW(...) ST_V(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("lh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + uint16_t val_lh_m = *(const uint16_t *)(psrc); \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + uint32_t val_lw_m = *(const uint32_t *)(psrc); \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + uint64_t val_ld_m = *(const uint64_t *)(psrc); \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m); \ - val1_m = LW(psrc_m + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } - -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - __asm__ __volatile__("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ - } +#define SH(val, pdst) *(uint16_t *)(pdst) = (val); +#define SW(val, pdst) *(uint32_t *)(pdst) = (val); +#define SD(val, pdst) *(uint64_t *)(pdst) = (val); #else // !(__mips_isa_rev >= 6) -#define LH(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint16_t val_m; \ - \ - __asm__ __volatile__("ulh %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LH(psrc) \ + ({ \ + const uint8_t *psrc_lh_m = (const uint8_t *)(psrc); \ + uint16_t val_lh_m; \ + \ + __asm__ __volatile__("ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ + \ + : [val_lh_m] "=r"(val_lh_m) \ + : [psrc_lh_m] "m"(*psrc_lh_m)); \ + \ + val_lh_m; \ }) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t 
*)(psrc); \ - uint32_t val_m; \ - \ - __asm__ __volatile__("ulw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \ + uint32_t val_lw_m; \ + \ + __asm__ __volatile__("ulw %[val_lw_m], %[psrc_lw_m] \n\t" \ + \ + : [val_lw_m] "=r"(val_lw_m) \ + : [psrc_lw_m] "m"(*psrc_lw_m)); \ + \ + val_lw_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - __asm__ __volatile__("uld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint64_t val_ld_m = 0; \ + \ + __asm__ __volatile__("uld %[val_ld_m], %[psrc_ld_m] \n\t" \ + \ + : [val_ld_m] "=r"(val_ld_m) \ + : [psrc_ld_m] "m"(*psrc_ld_m)); \ + \ + val_ld_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m_combined = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m_combined = (uint64_t)(val1_m); \ - val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ - val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ - \ - val_m_combined; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \ + uint32_t val0_ld_m, val1_ld_m; \ + uint64_t val_ld_m = 0; \ + \ + val0_ld_m = LW(psrc_ld_m); \ + val1_ld_m = LW(psrc_ld_m + 4); \ + \ + val_ld_m = (uint64_t)(val1_ld_m); \ + val_ld_m = (uint64_t)((val_ld_m << 32) & 0xFFFFFFFF00000000); \ + val_ld_m = (uint64_t)(val_ld_m | (uint64_t)val0_ld_m); \ + \ + val_ld_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - __asm__ __volatile__("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *pdst_sh_m = (uint8_t *)(pdst); \ + const uint16_t val_sh_m = (val); \ + \ + __asm__ __volatile__("ush %[val_sh_m], %[pdst_sh_m] \n\t" \ + \ + : [pdst_sh_m] "=m"(*pdst_sh_m) \ + : [val_sh_m] "r"(val_sh_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - __asm__ __volatile__("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *pdst_sw_m = (uint8_t *)(pdst); \ + const uint32_t val_sw_m = (val); \ + \ + __asm__ __volatile__("usw %[val_sw_m], %[pdst_sw_m] \n\t" \ + \ + : [pdst_sw_m] "=m"(*pdst_sw_m) \ + : [val_sw_m] "r"(val_sw_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *pdst_sd_m = (uint8_t *)(pdst); \ + uint32_t val0_sd_m, val1_sd_m; \ + \ + val0_sd_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + val1_sd_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(val0_sd_m, pdst_sd_m); \ + SW(val1_sd_m, pdst_sd_m + 4); \ } #endif // (__mips_isa_rev >= 6) @@ -283,97 +225,73 @@ SD(in3, (pdst) + 3 * stride); \ } -/* Description : Load vectors with 16 byte elements with stride +/* Description : Load vector 
elements with stride Arguments : Inputs - psrc, stride Outputs - out0, out1 Return Type - as per RTYPE Details : Load 16 byte elements in 'out0' from (psrc) Load 16 byte elements in 'out1' from (psrc + stride) */ -#define LD_B2(RTYPE, psrc, stride, out0, out1) \ +#define LD_V2(RTYPE, psrc, stride, out0, out1) \ { \ - out0 = LD_B(RTYPE, (psrc)); \ - out1 = LD_B(RTYPE, (psrc) + stride); \ + out0 = LD_V(RTYPE, (psrc)); \ + out1 = LD_V(RTYPE, (psrc) + stride); \ } -#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) -#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) +#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) +#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) +#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) -#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ +#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ } -#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) +#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) -#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ +#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ { \ - LD_B2(RTYPE, (psrc), stride, out0, out1); \ - LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + LD_V2(RTYPE, (psrc), stride, out0, out1); \ + LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ } -#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) -#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) +#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) +#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) -#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ +#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ } -#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) -#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) +#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) +#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) -#define LD_B7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ +#define LD_V7(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6) \ { \ - LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ - LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ + LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ + LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ } -#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) +#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) -#define LD_B8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7) \ { \ - LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) -#define LD_SB8(...) 
LD_B8(v16i8, __VA_ARGS__) - -/* Description : Load vectors with 8 halfword elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Details : Load 8 halfword elements in 'out0' from (psrc) - Load 8 halfword elements in 'out1' from (psrc + stride) -*/ -#define LD_H2(RTYPE, psrc, stride, out0, out1) \ - { \ - out0 = LD_H(RTYPE, (psrc)); \ - out1 = LD_H(RTYPE, (psrc) + (stride)); \ - } -#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) - -#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ - { \ - LD_H2(RTYPE, (psrc), stride, out0, out1); \ - LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ + LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ + LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ } -#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) +#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) +#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) +#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__) -#define LD_H8(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ - out7) \ - { \ - LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ - LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ - } -#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) - -#define LD_H16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ +#define LD_V16(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5, out6, \ out7, out8, out9, out10, out11, out12, out13, out14, out15) \ { \ - LD_H8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ + LD_V8(RTYPE, (psrc), stride, out0, out1, out2, out3, out4, out5, out6, \ out7); \ - LD_H8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ + LD_V8(RTYPE, (psrc) + 8 * stride, stride, out8, out9, out10, out11, out12, \ out13, out14, out15); \ } -#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) +#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) /* Description : Load 4x4 block of signed halfword elements from 1D source data into 4 vectors (Each vector with 4 signed halfwords) @@ -388,79 +306,35 @@ out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2); \ } -/* Description : Load 2 vectors of signed word elements with stride - Arguments : Inputs - psrc, stride - Outputs - out0, out1 - Return Type - signed word -*/ -#define LD_SW2(psrc, stride, out0, out1) \ - { \ - out0 = LD_SW((psrc)); \ - out1 = LD_SW((psrc) + stride); \ - } - -/* Description : Store vectors of 16 byte elements with stride +/* Description : Store vectors with stride Arguments : Inputs - in0, in1, pdst, stride Details : Store 16 byte elements from 'in0' to (pdst) Store 16 byte elements from 'in1' to (pdst + stride) */ -#define ST_B2(RTYPE, in0, in1, pdst, stride) \ - { \ - ST_B(RTYPE, in0, (pdst)); \ - ST_B(RTYPE, in1, (pdst) + stride); \ - } -#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) - -#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ - { \ - ST_B2(RTYPE, in0, in1, (pdst), stride); \ - ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ - } -#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) - -#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ - { \ - ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ - ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_UB8(...) 
ST_B8(v16u8, __VA_ARGS__) - -/* Description : Store vectors of 8 halfword elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 8 halfword elements from 'in0' to (pdst) - Store 8 halfword elements from 'in1' to (pdst + stride) -*/ -#define ST_H2(RTYPE, in0, in1, pdst, stride) \ +#define ST_V2(RTYPE, in0, in1, pdst, stride) \ { \ - ST_H(RTYPE, in0, (pdst)); \ - ST_H(RTYPE, in1, (pdst) + stride); \ + ST_V(RTYPE, in0, (pdst)); \ + ST_V(RTYPE, in1, (pdst) + stride); \ } -#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) +#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) +#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) +#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) -#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ +#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ { \ - ST_H2(RTYPE, in0, in1, (pdst), stride); \ - ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ + ST_V2(RTYPE, in0, in1, (pdst), stride); \ + ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ } -#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) +#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) +#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__) -#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ { \ - ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ - ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ - } -#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) - -/* Description : Store vectors of word elements with stride - Arguments : Inputs - in0, in1, pdst, stride - Details : Store 4 word elements from 'in0' to (pdst) - Store 4 word elements from 'in1' to (pdst + stride) -*/ -#define ST_SW2(in0, in1, pdst, stride) \ - { \ - ST_SW(in0, (pdst)); \ - ST_SW(in1, (pdst) + stride); \ + ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride); \ + ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ } +#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) +#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__) /* Description : Store 2x4 byte block to destination memory from input vector Arguments : Inputs - in, stidx, pdst, stride @@ -681,6 +555,7 @@ #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__) #define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__) #define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__) +#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__) #define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, out0, out1, out2, \ out3) \ @@ -1308,6 +1183,7 @@ out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ } #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SB(...) ILVRL_W2(v16i8, __VA_ARGS__) #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) #define ILVRL_W2_SW(...) 
ILVRL_W2(v4i32, __VA_ARGS__) @@ -1721,6 +1597,25 @@ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \ } +/* Description : Sign extend byte elements from input vector and return + halfword results in pair of vectors + Arguments : Input - in (byte vector) + Outputs - out0, out1 (sign extended halfword vectors) + Return Type - signed halfword + Details : Sign bit of byte elements from input vector 'in' is + extracted and interleaved right with same vector 'in0' to + generate 8 signed halfword elements in 'out0' + Then interleaved left with same vector 'in0' to + generate 8 signed halfword elements in 'out1' +*/ +#define UNPCK_SB_SH(in, out0, out1) \ + { \ + v16i8 tmp_m; \ + \ + tmp_m = __msa_clti_s_b((v16i8)in, 0); \ + ILVRL_B2_SH(tmp_m, in, out0, out1); \ + } + /* Description : Zero extend unsigned byte elements to halfword elements Arguments : Input - in (unsigned byte vector) Outputs - out0, out1 (unsigned halfword vectors) @@ -1879,8 +1774,6 @@ out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ \ tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m); \ - tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m); \ out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m); \ @@ -2034,19 +1927,17 @@ /* Description : Converts inputs to unsigned bytes, interleave, average & store as 8x4 unsigned byte block - Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3, - pdst, stride + Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride */ -#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, dst2, dst3, \ - pdst, stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - tmp0_m = PCKEV_XORI128_UB(in0, in1); \ - tmp1_m = PCKEV_XORI128_UB(in2, in3); \ - ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ +#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + tmp0_m = PCKEV_XORI128_UB(in0, in1); \ + tmp1_m = PCKEV_XORI128_UB(in2, in3); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } /* Description : Pack even byte elements and store byte vector in destination diff --git a/libvpx/vpx_dsp/mips/sad_mmi.c b/libvpx/vpx_dsp/mips/sad_mmi.c new file mode 100644 index 000000000..33bd3fe7f --- /dev/null +++ b/libvpx/vpx_dsp/mips/sad_mmi.c @@ -0,0 +1,805 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_ports/asmdefs_mmi.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" + +#define SAD_SRC_REF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], 
%[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_REF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_REF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[src]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_REF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +#define SAD_SRC_AVGREF_ABS_SUB_64 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ 
+ "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x27(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x20(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x2f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x28(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x27(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x20(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x2f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x28(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x37(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x30(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x3f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x38(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x37(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x30(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x3f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x38(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_32 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x18(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x17(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x10(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x1f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x18(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x17(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x10(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x1f(%[src]) \n\t" \ + 
"gsldrc1 %[ftmp2], 0x18(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_16 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp3], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp3], 0x00(%[ref]) \n\t" \ + "gsldlc1 %[ftmp4], 0x0f(%[ref]) \n\t" \ + "gsldrc1 %[ftmp4], 0x08(%[ref]) \n\t" \ + "pavgb %[ftmp3], %[ftmp1], %[ftmp3] \n\t" \ + "pavgb %[ftmp4], %[ftmp2], %[ftmp4] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[src]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp3] \n\t" \ + "pasubub %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "biadd %[ftmp2], %[ftmp2] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "paddw %[ftmp5], %[ftmp5], %[ftmp2] \n\t" + +#define SAD_SRC_AVGREF_ABS_SUB_8 \ + "gsldlc1 %[ftmp1], 0x07(%[second_pred]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[ref]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" + +#if _MIPS_SIM == _ABIO32 +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "ulw %[tmp0], 0x00(%[second_pred]) \n\t" \ + "mtc1 %[tmp0], %[ftmp1] \n\t" \ + "ulw %[tmp0], 0x00(%[ref]) \n\t" \ + "mtc1 %[tmp0], %[ftmp2] \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#else /* _MIPS_SIM == _ABI64 || _MIPS_SIM == _ABIN32 */ +#define SAD_SRC_AVGREF_ABS_SUB_4 \ + "gslwlc1 %[ftmp1], 0x03(%[second_pred]) \n\t" \ + "gslwrc1 %[ftmp1], 0x00(%[second_pred]) \n\t" \ + "gslwlc1 %[ftmp2], 0x03(%[ref]) \n\t" \ + "gslwrc1 %[ftmp2], 0x00(%[ref]) \n\t" \ + "pavgb %[ftmp2], %[ftmp1], %[ftmp2] \n\t" \ + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" \ + "pasubub %[ftmp1], %[ftmp1], %[ftmp2] \n\t" \ + "mthc1 $0, %[ftmp1] \n\t" \ + "biadd %[ftmp1], %[ftmp1] \n\t" \ + "paddw %[ftmp3], %[ftmp3], %[ftmp1] \n\t" +#endif /* _MIPS_SIM == _ABIO32 */ + +// depending on call sites, pass **ref_array to avoid & in subsequent call and +// de-dup with 4D below. 
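/* Annotation (editor's, not part of the patch): a sketch of what the 4D
 * wrapper below expands to for one instantiation, sadMxNx4D_mmi(16, 16),
 * assuming the vpx_sad16x16_mmi kernel generated further down in this
 * file; it scores four candidate reference blocks with the same pairwise
 * SAD. */
void vpx_sad16x16x4d_mmi(const uint8_t *src, int src_stride,
                         const uint8_t *const ref_array[], int ref_stride,
                         uint32_t *sad_array) {
  int i;
  for (i = 0; i < 4; ++i)
    sad_array[i] = vpx_sad16x16_mmi(src, src_stride, ref_array[i], ref_stride);
}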
+#define sadMxNxK_mmi(m, n, k) \ + void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref_array, int ref_stride, \ + uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < k; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \ + } + +// This appears to be equivalent to the above when k == 4 and refs is const +#define sadMxNx4D_mmi(m, n) \ + void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + for (i = 0; i < 4; ++i) \ + sad_array[i] = \ + vpx_sad##m##x##n##_mmi(src, src_stride, ref_array[i], ref_stride); \ + } + +static inline unsigned int vpx_sad64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_64 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad64xN(H) \ + unsigned int vpx_sad64x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad64x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad64xN(64); +vpx_sad64xN(32); +sadMxNx4D_mmi(64, 64); +sadMxNx4D_mmi(64, 32); + +static inline unsigned int vpx_sad_avg64x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
+ SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_64 + MMI_ADDIU(%[second_pred], %[second_pred], 0x40) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg64xN(H) \ + unsigned int vpx_sad64x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg64x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg64xN(64); +vpx_sad_avg64xN(32); + +static inline unsigned int vpx_sad32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_32 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad32xN(H) \ + unsigned int vpx_sad32x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad32x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad32xN(64); +vpx_sad32xN(32); +vpx_sad32xN(16); +sadMxNx4D_mmi(32, 64); +sadMxNx4D_mmi(32, 32); +sadMxNx4D_mmi(32, 16); + +static inline unsigned int vpx_sad_avg32x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
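// Editor's note: in the _avg kernels, pavgb first forms the rounding byte
// average (ref + second_pred + 1) >> 1, so the SAD is measured against the
// compound (averaged) predictor, matching the generic C _avg behavior.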
+ SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_32 + MMI_ADDIU(%[second_pred], %[second_pred], 0x20) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg32xN(H) \ + unsigned int vpx_sad32x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg32x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg32xN(64); +vpx_sad_avg32xN(32); +vpx_sad_avg32xN(16); + +static inline unsigned int vpx_sad16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_16 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad16xN(H) \ + unsigned int vpx_sad16x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad16x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad16xN(32); +vpx_sad16xN(16); +vpx_sad16xN(8); +sadMxNxK_mmi(16, 16, 3); +sadMxNxK_mmi(16, 16, 8); +sadMxNxK_mmi(16, 8, 3); +sadMxNxK_mmi(16, 8, 8); +sadMxNx4D_mmi(16, 32); +sadMxNx4D_mmi(16, 16); +sadMxNx4D_mmi(16, 8); + +static inline unsigned int vpx_sad_avg16x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3, ftmp4, ftmp5; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
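// Editor's note: second_pred points at a contiguous width x height block,
// so the pointer advances by the row width (0x10 = 16 bytes here, 0x20 and
// 0x40 in the 32- and 64-wide kernels) rather than by an external stride.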
+ SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_16 + MMI_ADDIU(%[second_pred], %[second_pred], 0x10) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp5] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [ftmp4]"=&f"(ftmp4), [ftmp5]"=&f"(ftmp5), [counter]"+&r"(l_counter), + [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg16xN(H) \ + unsigned int vpx_sad16x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg16x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg16xN(32); +vpx_sad_avg16xN(16); +vpx_sad_avg16xN(8); + +static inline unsigned int vpx_sad8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_8 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad8xN(H) \ + unsigned int vpx_sad8x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad8x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad8xN(16); +vpx_sad8xN(8); +vpx_sad8xN(4); +sadMxNxK_mmi(8, 16, 3); +sadMxNxK_mmi(8, 16, 8); +sadMxNxK_mmi(8, 8, 3); +sadMxNxK_mmi(8, 8, 8); +sadMxNx4D_mmi(8, 16); +sadMxNx4D_mmi(8, 8); +sadMxNx4D_mmi(8, 4); + +static inline unsigned int vpx_sad_avg8x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
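// Editor's note: an 8-pixel row fits one 64-bit FP register, so the 8- and
// 4-wide kernels get by with three temporaries and accumulate the SAD in
// ftmp3 rather than ftmp5.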
+ SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_8 + MMI_ADDIU(%[second_pred], %[second_pred], 0x08) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg8xN(H) \ + unsigned int vpx_sad8x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg8x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg8xN(16); +vpx_sad_avg8xN(8); +vpx_sad_avg8xN(4); + +static inline unsigned int vpx_sad4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_REF_ABS_SUB_4 + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad4xN(H) \ + unsigned int vpx_sad4x##H##_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return vpx_sad4x(src, src_stride, ref, ref_stride, H); \ + } + +vpx_sad4xN(8); +vpx_sad4xN(4); +sadMxNxK_mmi(4, 4, 3); +sadMxNxK_mmi(4, 4, 8); +sadMxNx4D_mmi(4, 8); +sadMxNx4D_mmi(4, 4); + +static inline unsigned int vpx_sad_avg4x(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + const uint8_t *second_pred, + int counter) { + unsigned int sad; + double ftmp1, ftmp2, ftmp3; + mips_reg l_counter = counter; + + __asm__ volatile ( + "xor %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + // Include two loop body, to reduce loop time. 
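// Editor's note: the 4-wide loads fill only the low 32 bits of the FP
// register; SAD_SRC_AVGREF_ABS_SUB_4 clears the high half with "mthc1 $0"
// before biadd so that stale upper-half data is not folded into the sum.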
+ SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + SAD_SRC_AVGREF_ABS_SUB_4 + MMI_ADDIU(%[second_pred], %[second_pred], 0x04) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[ref], %[ref], %[ref_stride]) + MMI_ADDIU(%[counter], %[counter], -0x02) + "bnez %[counter], 1b \n\t" + "mfc1 %[sad], %[ftmp3] \n\t" + : [ftmp1]"=&f"(ftmp1), [ftmp2]"=&f"(ftmp2), [ftmp3]"=&f"(ftmp3), + [counter]"+&r"(l_counter), [src]"+&r"(src), [ref]"+&r"(ref), + [second_pred]"+&r"((mips_reg)second_pred), + [sad]"=&r"(sad) + : [src_stride]"r"((mips_reg)src_stride), + [ref_stride]"r"((mips_reg)ref_stride) + ); + + return sad; +} + +#define vpx_sad_avg4xN(H) \ + unsigned int vpx_sad4x##H##_avg_mmi(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return vpx_sad_avg4x(src, src_stride, ref, ref_stride, second_pred, H); \ + } + +vpx_sad_avg4xN(8); +vpx_sad_avg4xN(4); diff --git a/libvpx/vpx_dsp/mips/sad_msa.c b/libvpx/vpx_dsp/mips/sad_msa.c index e295123ac..ab681ae9f 100644 --- a/libvpx/vpx_dsp/mips/sad_msa.c +++ b/libvpx/vpx_dsp/mips/sad_msa.c @@ -283,96 +283,6 @@ static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride, sad_array[2] = HADD_UH_U32(sad2); } -static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - - for (ht_cnt = height >> 1; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); -} - -static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v4u32 sad; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3); - ref0_4 = LD_UB(ref + 64); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - 
SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32((v4i32)sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32((v4i32)sad); -} - static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride, int32_t height, uint32_t *sad_array) { @@ -623,176 +533,6 @@ static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride, sad_array[7] = HADD_UH_U32(sad7); } -static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - int32_t ht_cnt; - v16u8 src0, src1; - v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2; - v8u16 sad0 = { 0 }; - v8u16 sad1 = { 0 }; - v8u16 sad2 = { 0 }; - v8u16 sad3 = { 0 }; - v8u16 sad4 = { 0 }; - v8u16 sad5 = { 0 }; - v8u16 sad6 = { 0 }; - v8u16 sad7 = { 0 }; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB2(src, 16, src0, src1); - src += src_stride; - LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2); - ref += ref_stride; - - sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - sad1 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - sad2 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - sad3 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - sad4 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - sad5 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - sad6 += SAD_UB2_UH(src0, src1, ref0, ref1); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - sad7 += SAD_UB2_UH(src0, src1, ref0, ref1); - } - - sad_array[0] = HADD_UH_U32(sad0); - sad_array[1] = HADD_UH_U32(sad1); - sad_array[2] = HADD_UH_U32(sad2); - sad_array[3] = HADD_UH_U32(sad3); - sad_array[4] = HADD_UH_U32(sad4); - sad_array[5] = HADD_UH_U32(sad5); - sad_array[6] = HADD_UH_U32(sad6); - sad_array[7] = HADD_UH_U32(sad7); -} - -static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *ref, int32_t ref_stride, - int32_t height, uint32_t *sad_array) { - const uint8_t *src_dup, *ref_dup; - int32_t ht_cnt; - v16u8 src0, src1, src2, src3; - v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4; - v16u8 ref0, ref1, ref2, ref3; - v8u16 sad0_0 = { 0 }; - v8u16 sad0_1 = { 0 }; - v8u16 sad1_0 = { 0 }; - v8u16 sad1_1 = { 0 }; - v8u16 sad2_0 = { 0 }; - v8u16 sad2_1 = { 0 }; - v8u16 sad3_0 = { 0 }; - v8u16 sad3_1 = { 0 }; - v4u32 sad; - - src_dup = src; - ref_dup = ref; - - for (ht_cnt = height; ht_cnt--;) { - LD_UB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref += ref_stride; - - sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1); - sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1); - 
sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[0] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[1] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[2] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[3] = HADD_SW_S32(sad); - - sad0_0 = (v8u16)__msa_ldi_h(0); - sad0_1 = (v8u16)__msa_ldi_h(0); - sad1_0 = (v8u16)__msa_ldi_h(0); - sad1_1 = (v8u16)__msa_ldi_h(0); - sad2_0 = (v8u16)__msa_ldi_h(0); - sad2_1 = (v8u16)__msa_ldi_h(0); - sad3_0 = (v8u16)__msa_ldi_h(0); - sad3_1 = (v8u16)__msa_ldi_h(0); - - for (ht_cnt = 64; ht_cnt--;) { - LD_UB4(src_dup, 16, src0, src1, src2, src3); - src_dup += src_stride; - LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4); - ref_dup += ref_stride; - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4); - sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5); - sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6); - sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - - SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7); - SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7); - sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1); - sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); - } - - sad = __msa_hadd_u_w(sad0_0, sad0_0); - sad += __msa_hadd_u_w(sad0_1, sad0_1); - sad_array[4] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad1_0, sad1_0); - sad += __msa_hadd_u_w(sad1_1, sad1_1); - sad_array[5] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad2_0, sad2_0); - sad += __msa_hadd_u_w(sad2_1, sad2_1); - sad_array[6] = HADD_SW_S32(sad); - - sad = __msa_hadd_u_w(sad3_0, sad3_0); - sad += __msa_hadd_u_w(sad3_1, sad3_1); - sad_array[7] = HADD_SW_S32(sad); -} - static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride, const uint8_t *const aref_ptr[], int32_t ref_stride, int32_t height, @@ -1318,20 +1058,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } -#define VPX_SAD_32xHEIGHTx3_MSA(height) \ - void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx3_MSA(height) \ - void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t 
*ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx8_MSA(height) \ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ @@ -1353,20 +1079,6 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } -#define VPX_SAD_32xHEIGHTx8_MSA(height) \ - void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - -#define VPX_SAD_64xHEIGHTx8_MSA(height) \ - void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ - sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ - } - #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *const refs[], \ @@ -1444,43 +1156,31 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, // 64x64 VPX_SAD_64xHEIGHT_MSA(64); -VPX_SAD_64xHEIGHTx3_MSA(64); -VPX_SAD_64xHEIGHTx8_MSA(64); VPX_SAD_64xHEIGHTx4D_MSA(64); VPX_AVGSAD_64xHEIGHT_MSA(64); // 64x32 VPX_SAD_64xHEIGHT_MSA(32); -VPX_SAD_64xHEIGHTx3_MSA(32); -VPX_SAD_64xHEIGHTx8_MSA(32); VPX_SAD_64xHEIGHTx4D_MSA(32); VPX_AVGSAD_64xHEIGHT_MSA(32); // 32x64 VPX_SAD_32xHEIGHT_MSA(64); -VPX_SAD_32xHEIGHTx3_MSA(64); -VPX_SAD_32xHEIGHTx8_MSA(64); VPX_SAD_32xHEIGHTx4D_MSA(64); VPX_AVGSAD_32xHEIGHT_MSA(64); // 32x32 VPX_SAD_32xHEIGHT_MSA(32); -VPX_SAD_32xHEIGHTx3_MSA(32); -VPX_SAD_32xHEIGHTx8_MSA(32); VPX_SAD_32xHEIGHTx4D_MSA(32); VPX_AVGSAD_32xHEIGHT_MSA(32); // 32x16 VPX_SAD_32xHEIGHT_MSA(16); -VPX_SAD_32xHEIGHTx3_MSA(16); -VPX_SAD_32xHEIGHTx8_MSA(16); VPX_SAD_32xHEIGHTx4D_MSA(16); VPX_AVGSAD_32xHEIGHT_MSA(16); // 16x32 VPX_SAD_16xHEIGHT_MSA(32); -VPX_SAD_16xHEIGHTx3_MSA(32); -VPX_SAD_16xHEIGHTx8_MSA(32); VPX_SAD_16xHEIGHTx4D_MSA(32); VPX_AVGSAD_16xHEIGHT_MSA(32); @@ -1514,15 +1214,11 @@ VPX_AVGSAD_8xHEIGHT_MSA(8); // 8x4 VPX_SAD_8xHEIGHT_MSA(4); -VPX_SAD_8xHEIGHTx3_MSA(4); -VPX_SAD_8xHEIGHTx8_MSA(4); VPX_SAD_8xHEIGHTx4D_MSA(4); VPX_AVGSAD_8xHEIGHT_MSA(4); // 4x8 VPX_SAD_4xHEIGHT_MSA(8); -VPX_SAD_4xHEIGHTx3_MSA(8); -VPX_SAD_4xHEIGHTx8_MSA(8); VPX_SAD_4xHEIGHTx4D_MSA(8); VPX_AVGSAD_4xHEIGHT_MSA(8); diff --git a/libvpx/vpx_dsp/mips/subtract_mmi.c b/libvpx/vpx_dsp/mips/subtract_mmi.c new file mode 100644 index 000000000..9f361704a --- /dev/null +++ b/libvpx/vpx_dsp/mips/subtract_mmi.c @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" +#include "vpx_ports/asmdefs_mmi.h" + +void vpx_subtract_block_mmi(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + double ftmp[13]; + uint32_t tmp[1]; + + if (rows == cols) { + switch (rows) { + case 4: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp1] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp2] \n\t" +#else + "gslwlc1 %[ftmp1], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp2], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp2], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp3] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp4] \n\t" +#else + "gslwlc1 %[ftmp3], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp4], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp4], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp5] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp6] \n\t" +#else + "gslwlc1 %[ftmp5], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp6], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp6], 0x00(%[pred]) \n\t" +#endif + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + +#if _MIPS_SIM == _ABIO32 + "ulw %[tmp0], 0x00(%[src]) \n\t" + "mtc1 %[tmp0], %[ftmp7] \n\t" + "ulw %[tmp0], 0x00(%[pred]) \n\t" + "mtc1 %[tmp0], %[ftmp8] \n\t" +#else + "gslwlc1 %[ftmp7], 0x03(%[src]) \n\t" + "gslwrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gslwlc1 %[ftmp8], 0x03(%[pred]) \n\t" + "gslwrc1 %[ftmp8], 0x00(%[pred]) \n\t" +#endif + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp2], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp4], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp6], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp8], %[ftmp0] \n\t" + "psubh %[ftmp11], %[ftmp9], %[ftmp10] \n\t" + "gssdlc1 %[ftmp11], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp11], 0x00(%[diff]) \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), +#if _MIPS_SIM == _ABIO32 + [tmp0] "=&r"(tmp[0]), +#endif + [src] "+&r"(src), [pred] "+&r"(pred), [diff] "+&r"(diff) + : 
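// Editor's note: under the O32 ABI each 4-byte row goes through a GPR
// (ulw + mtc1), while the 64-bit ABIs load it with the Loongson unaligned
// FP loads (gslwlc1/gslwrc1) directly, as in the conditional blocks above.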
[src_stride] "r"((mips_reg)src_stride), + [pred_stride] "r"((mips_reg)pred_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 8: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x02 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp3], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp7], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x00(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] 
"=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), [ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 16: + __asm__ volatile( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x08 \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp3], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp3], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp4], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp4], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp6], 0x07(%[pred]) \n\t" + "gsldrc1 %[ftmp6], 0x00(%[pred]) \n\t" + "gsldlc1 %[ftmp7], 0x0f(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldlc1 %[ftmp8], 0x0f(%[pred]) \n\t" + "gsldrc1 %[ftmp8], 0x08(%[pred]) \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[pred], %[pred], %[pred_stride]) + "punpcklbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp2], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp2], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp3], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp3], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp4], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp4], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "punpcklbh %[ftmp9], %[ftmp5], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp6], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp6], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x07(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x00(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x0f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x08(%[diff]) \n\t" + "punpcklbh %[ftmp9], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp10], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp11], %[ftmp8], %[ftmp0] \n\t" + "punpckhbh %[ftmp12], %[ftmp8], %[ftmp0] \n\t" + "psubsh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" + "psubsh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" + "gssdlc1 %[ftmp9], 0x17(%[diff]) \n\t" + "gssdrc1 %[ftmp9], 0x10(%[diff]) \n\t" + "gssdlc1 %[ftmp10], 0x1f(%[diff]) \n\t" + "gssdrc1 %[ftmp10], 0x18(%[diff]) \n\t" + MMI_ADDU(%[diff], %[diff], %[diff_stride]) + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + "bnez %[tmp0], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), + [ftmp2] "=&f"(ftmp[2]), [ftmp3] "=&f"(ftmp[3]), + [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), + [ftmp8] "=&f"(ftmp[8]), [ftmp9] "=&f"(ftmp[9]), + [ftmp10] "=&f"(ftmp[10]), 
[ftmp11] "=&f"(ftmp[11]), + [ftmp12] "=&f"(ftmp[12]), [tmp0] "=&r"(tmp[0]), [src] "+&r"(src), + [pred] "+&r"(pred), [diff] "+&r"(diff) + : [pred_stride] "r"((mips_reg)pred_stride), + [src_stride] "r"((mips_reg)src_stride), + [diff_stride] "r"((mips_reg)(diff_stride * 2)) + : "memory"); + break; + case 32: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + case 64: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + default: + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, + pred, pred_stride); + break; + } + } else { + vpx_subtract_block_c(rows, cols, diff, diff_stride, src, src_stride, pred, + pred_stride); + } +} diff --git a/libvpx/vpx_dsp/mips/variance_mmi.c b/libvpx/vpx_dsp/mips/variance_mmi.c new file mode 100644 index 000000000..4af60d363 --- /dev/null +++ b/libvpx/vpx_dsp/mips/variance_mmi.c @@ -0,0 +1,1280 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/variance.h" +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_ports/asmdefs_mmi.h" + +static const uint8_t bilinear_filters[8][2] = { + { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, + { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, +}; + +/* Use VARIANCE_SSE_SUM_8_FOR_W64 in vpx_variance64x64,vpx_variance64x32, + vpx_variance32x64. VARIANCE_SSE_SUM_8 will lead to sum overflow. 
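   (Editor's note: VARIANCE_SSE_SUM_8 keeps the running pixel sums in
   16-bit lanes; for these block sizes a lane may have to hold totals
   above 65535, so the _FOR_W64 variant widens the differences to 32-bit
   lanes before accumulating.)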
*/ +#define VARIANCE_SSE_SUM_8_FOR_W64 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp6] \n\t" \ + "paddw %[ftmp10], %[ftmp10], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp3], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp5], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp5], %[ftmp0] \n\t" \ + "psubw %[ftmp3], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp5], %[ftmp2], %[ftmp8] \n\t" \ + "punpcklhw %[ftmp1], %[ftmp4], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp2], %[ftmp4], %[ftmp0] \n\t" \ + "punpcklhw %[ftmp7], %[ftmp6], %[ftmp0] \n\t" \ + "punpckhhw %[ftmp8], %[ftmp6], %[ftmp0] \n\t" \ + "psubw %[ftmp4], %[ftmp1], %[ftmp7] \n\t" \ + "psubw %[ftmp6], %[ftmp2], %[ftmp8] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp3] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp4] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp5] \n\t" \ + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + +#define VARIANCE_SSE_SUM_4 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp5], %[ftmp4], %[ftmp4] \n\t" \ + "paddw %[ftmp6], %[ftmp6], %[ftmp5] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp7], %[ftmp7], %[ftmp3] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp4] \n\t" + +#define VARIANCE_SSE_SUM_8 \ + /* sse */ \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" \ + \ + /* sum */ \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpcklbh %[ftmp5], %[ftmp2], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp6], %[ftmp2], %[ftmp0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp3] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp4] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" \ + "paddh %[ftmp12], %[ftmp12], %[ftmp6] \n\t" + +#define VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" \ + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw %[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VARIANCE_SSE_16 \ + VARIANCE_SSE_8 \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" \ + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" \ + "pasubub %[ftmp3], %[ftmp1], %[ftmp2] \n\t" \ + "punpcklbh %[ftmp4], %[ftmp3], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp3], %[ftmp0] \n\t" \ + "pmaddhw %[ftmp6], %[ftmp4], %[ftmp4] \n\t" \ + "pmaddhw 
%[ftmp7], %[ftmp5], %[ftmp5] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp6] \n\t" \ + "paddw %[ftmp8], %[ftmp8], %[ftmp7] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A \ + /* calculate fdata3[0]~fdata3[3], store at ftmp2*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B \ + /* calculate fdata3[0]~fdata3[3], store at ftmp4*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + \ + /* store: temp2[0] ~ temp2[3] */ \ + "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp0] \n\t" \ + "gssdrc1 %[ftmp4], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp2], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp3], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp2], %[ftmp2], %[filter_x0] \n\t" \ + "pmullh %[ftmp3], %[ftmp3], %[filter_x0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x1] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp4] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp5] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" \ + "punpcklbh %[ftmp8], 
%[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp9], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x01(%[a]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp8], %[ftmp8], %[filter_x0] \n\t" \ + "pmullh %[ftmp9], %[ftmp9], %[filter_x0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x1] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp10] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp11] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp2], %[ftmp2], %[filter_y0] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp8], %[filter_y1] \n\t" \ + "paddh %[ftmp2], %[ftmp2], %[ftmp1] \n\t" \ + "psrlh %[ftmp2], %[ftmp2], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp3], %[ftmp3], %[filter_y0] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp9], %[filter_y1] \n\t" \ + "paddh %[ftmp3], %[ftmp3], %[ftmp1] \n\t" \ + "psrlh %[ftmp3], %[ftmp3], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "and %[ftmp2], %[ftmp2], %[mask] \n\t" \ + "and %[ftmp3], %[ftmp3], %[mask] \n\t" \ + "packushb %[ftmp2], %[ftmp2], %[ftmp3] \n\t" \ + "gssdlc1 %[ftmp2], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp2], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + /* calculate: temp2[0] ~ temp2[3] */ \ + "pmullh %[ftmp8], %[ftmp8], %[filter_y0] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp2], %[filter_y1] \n\t" \ + "paddh %[ftmp8], %[ftmp8], %[ftmp1] \n\t" \ + "psrlh %[ftmp8], %[ftmp8], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[4] ~ temp2[7] */ \ + "pmullh %[ftmp9], %[ftmp9], %[filter_y0] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp3], %[filter_y1] \n\t" \ + "paddh %[ftmp9], %[ftmp9], %[ftmp1] \n\t" \ + "psrlh %[ftmp9], %[ftmp9], %[ftmp14] \n\t" \ + \ + /* store: temp2[0] ~ temp2[7] */ \ + "and %[ftmp8], %[ftmp8], %[mask] \n\t" \ + "and %[ftmp9], %[ftmp9], %[mask] \n\t" \ + "packushb %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \ + "gssdlc1 %[ftmp8], 0x07(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp8], 0x00(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A \ + /* calculate fdata3[0]~fdata3[7], store at ftmp2 and ftmp3*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp4 and ftmp5*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "punpcklbh %[ftmp4], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "punpcklbh %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp7], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp4], %[ftmp4], %[filter_x0] \n\t" \ + "pmullh %[ftmp5], %[ftmp5], %[filter_x0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp6], %[ftmp6], %[filter_x1] \n\t" \ + "pmullh %[ftmp7], %[ftmp7], %[filter_x1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp6] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp7] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], 
%[ftmp14] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B \ + /* calculate fdata3[0]~fdata3[7], store at ftmp8 and ftmp9*/ \ + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B \ + \ + /* calculate fdata3[8]~fdata3[15], store at ftmp10 and ftmp11*/ \ + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" \ + "punpcklbh %[ftmp10], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp11], %[ftmp1], %[ftmp0] \n\t" \ + "gsldlc1 %[ftmp1], 0x10(%[a]) \n\t" \ + "gsldrc1 %[ftmp1], 0x09(%[a]) \n\t" \ + "punpcklbh %[ftmp12], %[ftmp1], %[ftmp0] \n\t" \ + "punpckhbh %[ftmp13], %[ftmp1], %[ftmp0] \n\t" \ + "pmullh %[ftmp10], %[ftmp10], %[filter_x0] \n\t" \ + "pmullh %[ftmp11], %[ftmp11], %[filter_x0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp12], %[ftmp12], %[filter_x1] \n\t" \ + "pmullh %[ftmp13], %[ftmp13], %[filter_x1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp12] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp13] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp4], %[ftmp4], %[filter_y0] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp10], %[filter_y1] \n\t" \ + "paddh %[ftmp4], %[ftmp4], %[ftmp1] \n\t" \ + "psrlh %[ftmp4], %[ftmp4], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp5], %[ftmp5], %[filter_y0] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp11], %[filter_y1] \n\t" \ + "paddh %[ftmp5], %[ftmp5], %[ftmp1] \n\t" \ + "psrlh %[ftmp5], %[ftmp5], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "and %[ftmp4], %[ftmp4], %[mask] \n\t" \ + "and %[ftmp5], %[ftmp5], %[mask] \n\t" \ + "packushb %[ftmp4], %[ftmp4], %[ftmp5] \n\t" \ + "gssdlc1 %[ftmp4], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp4], 0x08(%[temp2_ptr]) \n\t" + +#define VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B \ + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B \ + \ + /* calculate: temp2[8] ~ temp2[11] */ \ + "pmullh %[ftmp10], %[ftmp10], %[filter_y0] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp4], %[filter_y1] \n\t" \ + "paddh %[ftmp10], %[ftmp10], %[ftmp1] \n\t" \ + "psrlh %[ftmp10], %[ftmp10], %[ftmp14] \n\t" \ + \ + /* calculate: temp2[12] ~ temp2[15] */ \ + "pmullh %[ftmp11], %[ftmp11], %[filter_y0] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ff_ph_40] \n\t" \ + "pmullh %[ftmp1], %[ftmp5], %[filter_y1] \n\t" \ + "paddh %[ftmp11], %[ftmp11], %[ftmp1] \n\t" \ + "psrlh %[ftmp11], %[ftmp11], %[ftmp14] \n\t" \ + \ + /* store: temp2[8] ~ temp2[15] */ \ + "and %[ftmp10], %[ftmp10], %[mask] \n\t" \ + "and %[ftmp11], %[ftmp11], %[mask] \n\t" \ + "packushb %[ftmp10], %[ftmp10], %[ftmp11] \n\t" \ + "gssdlc1 %[ftmp10], 0x0f(%[temp2_ptr]) \n\t" \ + "gssdrc1 %[ftmp10], 0x08(%[temp2_ptr]) \n\t" + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the first-pass of 2-D separable filter. +// +// Produces int16_t output to retain precision for the next pass. Two filter +// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is +// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). 
+// It defines the offset required to move from one input to the next. +static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal +// or vertical direction to produce the filtered output block. Used to implement +// the second-pass of 2-D separable filter. +// +// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two +// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the +// filter is applied horizontally (pixel_step = 1) or vertically +// (pixel_step = stride). It defines the offset required to move from one input +// to the next. Output is 8-bit. +static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, + unsigned int src_pixels_per_line, + unsigned int pixel_step, + unsigned int output_height, + unsigned int output_width, + const uint8_t *filter) { + unsigned int i, j; + + for (i = 0; i < output_height; ++i) { + for (j = 0; j < output_width; ++j) { + b[j] = ROUND_POWER_OF_TWO( + (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS); + ++a; + } + + a += src_pixels_per_line - output_width; + b += output_width; + } +} + +static inline uint32_t vpx_variance64x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x27(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x20(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x27(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x20(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x2f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x28(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x2f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x28(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x37(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x30(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x37(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x30(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x3f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x38(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x3f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x38(%[b]) \n\t" + 
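// Editor's note: the invocation below completes the eighth and last 8-byte
// chunk of a 64-pixel row; ftmp10 gathers the squared differences (SSE)
// and ftmp9 the signed differences (sum). The function finally returns
//   *sse - (int64_t)sum * sum / (64 * high),
// i.e. the E[d^2] - E[d]^2 identity scaled by the pixel count.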
VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [a]"+&r"(a), [b]"+&r"(b), + [sum]"=&r"(sum) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (64 * high)); +} + +#define VPX_VARIANCE64XN(n) \ + uint32_t vpx_variance64x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance64x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE64XN(64) +VPX_VARIANCE64XN(32) + +uint32_t vpx_variance32x64_mmi(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, uint32_t *sse) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + "li %[tmp0], 0x40 \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp9], %[ftmp9], %[ftmp9] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8_FOR_W64 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "mfc1 %[tmp1], %[ftmp9] \n\t" + "mfhc1 %[tmp2], %[ftmp9] \n\t" + "addu %[sum], %[tmp1], %[tmp2] \n\t" + "dsrl %[ftmp1], %[ftmp10], %[ftmp11] \n\t" + "paddw %[ftmp1], %[ftmp1], %[ftmp10] \n\t" + "swc1 %[ftmp1], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [tmp2]"=&r"(tmp[2]), + [a]"+&r"(a), [b]"+&r"(b), + [sum]"=&r"(sum) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [sse]"r"(sse) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / 2048); +} + +static inline uint32_t vpx_variance32x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse 
= 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x17(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x10(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x17(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x10(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x1f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x18(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x1f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x18(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (32 * high)); +} + +#define VPX_VARIANCE32XN(n) \ + uint32_t vpx_variance32x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance32x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE32XN(32) +VPX_VARIANCE32XN(16) + +static inline uint32_t vpx_variance16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + "gsldlc1 %[ftmp1], 0x0f(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x08(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x0f(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x08(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + 
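/* Reviewer note: %[ftmp8] appears to accumulate the SSE as two 32-bit lanes,
   folded into one word by the dsrl-by-32/paddw pair below and stored to *sse
   via swc1; %[ftmp10] and %[ftmp12] presumably hold the packed positive and
   negative difference totals, which the punpck widening plus the paddw/psubw
   sequence combines into the signed sum. */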
"dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (16 * high)); +} + +#define VPX_VARIANCE16XN(n) \ + uint32_t vpx_variance16x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance16x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE16XN(32) +VPX_VARIANCE16XN(16) +VPX_VARIANCE16XN(8) + +static inline uint32_t vpx_variance8x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[13]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "xor %[ftmp10], %[ftmp10], %[ftmp10] \n\t" + "xor %[ftmp12], %[ftmp12], %[ftmp12] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp10], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp10], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp12], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp12], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp11] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [ftmp12]"=&f"(ftmp[12]), [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (8 * high)); +} + +#define VPX_VARIANCE8XN(n) \ + uint32_t vpx_variance8x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t 
*sse) { \ + return vpx_variance8x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE8XN(16) +VPX_VARIANCE8XN(8) +VPX_VARIANCE8XN(4) + +static inline uint32_t vpx_variance4x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + uint32_t *sse, int high) { + int sum; + double ftmp[12]; + uint32_t tmp[3]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp10] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp6], %[ftmp6], %[ftmp6] \n\t" + "xor %[ftmp7], %[ftmp7], %[ftmp7] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[a]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[a]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[b]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[b]) \n\t" + VARIANCE_SSE_SUM_4 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp6], %[ftmp10] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp6] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + + "punpcklhw %[ftmp3], %[ftmp7], %[ftmp0] \n\t" + "punpckhhw %[ftmp4], %[ftmp7], %[ftmp0] \n\t" + "punpcklhw %[ftmp5], %[ftmp8], %[ftmp0] \n\t" + "punpckhhw %[ftmp6], %[ftmp8], %[ftmp0] \n\t" + "paddw %[ftmp3], %[ftmp3], %[ftmp4] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp5] \n\t" + "psubw %[ftmp3], %[ftmp3], %[ftmp6] \n\t" + "dsrl %[ftmp0], %[ftmp3], %[ftmp10] \n\t" + "paddw %[ftmp0], %[ftmp0], %[ftmp3] \n\t" + "swc1 %[ftmp0], 0x00(%[sum]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse), [sum]"r"(&sum) + : "memory" + ); + + return *sse - (((int64_t)sum * sum) / (4 * high)); +} + +#define VPX_VARIANCE4XN(n) \ + uint32_t vpx_variance4x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_variance4x(a, a_stride, b, b_stride, sse, n); \ + } + +VPX_VARIANCE4XN(8) +VPX_VARIANCE4XN(4) + +static inline uint32_t vpx_mse16x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse, + uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_16 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse; +} + +#define vpx_mse16xN(n) \ + uint32_t vpx_mse16x##n##_mmi(const uint8_t *a, 
int a_stride, \ + const uint8_t *b, int b_stride, \ + uint32_t *sse) { \ + return vpx_mse16x(a, a_stride, b, b_stride, sse, n); \ + } + +vpx_mse16xN(16); +vpx_mse16xN(8); + +static inline uint32_t vpx_mse8x(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, uint32_t *sse, + uint64_t high) { + double ftmp[12]; + uint32_t tmp[1]; + + *sse = 0; + + __asm__ volatile ( + "li %[tmp0], 0x20 \n\t" + "mtc1 %[tmp0], %[ftmp11] \n\t" + MMI_L(%[tmp0], %[high], 0x00) + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "xor %[ftmp8], %[ftmp8], %[ftmp8] \n\t" + + "1: \n\t" + VARIANCE_SSE_8 + + "addiu %[tmp0], %[tmp0], -0x01 \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + MMI_ADDU(%[b], %[b], %[b_stride]) + "bnez %[tmp0], 1b \n\t" + + "dsrl %[ftmp9], %[ftmp8], %[ftmp11] \n\t" + "paddw %[ftmp9], %[ftmp9], %[ftmp8] \n\t" + "swc1 %[ftmp9], 0x00(%[sse]) \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]), + [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]), + [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]), + [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]), + [tmp0]"=&r"(tmp[0]), + [a]"+&r"(a), [b]"+&r"(b) + : [a_stride]"r"((mips_reg)a_stride),[b_stride]"r"((mips_reg)b_stride), + [high]"r"(&high), [sse]"r"(sse) + : "memory" + ); + + return *sse; +} + +#define vpx_mse8xN(n) \ + uint32_t vpx_mse8x##n##_mmi(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + return vpx_mse8x(a, a_stride, b, b_stride, sse, n); \ + } + +vpx_mse8xN(16); +vpx_mse8xN(8); + +#define SUBPIX_VAR(W, H) \ + uint32_t vpx_sub_pixel_variance##W##x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ + \ + return vpx_variance##W##x##H##_mmi(temp2, W, b, b_stride, sse); \ + } + +SUBPIX_VAR(64, 64) +SUBPIX_VAR(64, 32) +SUBPIX_VAR(32, 64) +SUBPIX_VAR(32, 32) +SUBPIX_VAR(32, 16) +SUBPIX_VAR(16, 32) + +static inline void var_filter_block2d_bil_16x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + + // fdata3: fdata3[0] ~ fdata3[15] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + // temp2: temp2[0] ~ temp2[15] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[15] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + // temp2+16*1: temp2[0] ~ temp2[15] + MMI_ADDIU(%[temp2_ptr], 
%[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_16_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x10) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_16_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR16XN(H) \ + uint32_t vpx_sub_pixel_variance16x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp2[16 * H]; \ + var_filter_block2d_bil_16x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance16x##H##_mmi(temp2, 16, b, b_stride, sse); \ + } + +SUBPIX_VAR16XN(16) +SUBPIX_VAR16XN(8) + +static inline void var_filter_block2d_bil_8x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[15]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp14]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + + // fdata3: fdata3[0] ~ fdata3[7] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[7] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + // temp2+8*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_8_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x08) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_8_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] 
"=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [ftmp7] "=&f"(ftmp[7]), [ftmp8] "=&f"(ftmp[8]), + [ftmp9] "=&f"(ftmp[9]), [ftmp10] "=&f"(ftmp[10]), + [ftmp11] "=&f"(ftmp[11]), [ftmp12] "=&f"(ftmp[12]), + [ftmp13] "=&f"(ftmp[13]), [ftmp14] "=&f"(ftmp[14]), + [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), [temp2_ptr] "+&r"(temp2_ptr), + [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR8XN(H) \ + uint32_t vpx_sub_pixel_variance8x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint8_t temp2[8 * H]; \ + var_filter_block2d_bil_8x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance8x##H##_mmi(temp2, 8, b, b_stride, sse); \ + } + +SUBPIX_VAR8XN(16) +SUBPIX_VAR8XN(8) +SUBPIX_VAR8XN(4) + +static inline void var_filter_block2d_bil_4x(const uint8_t *a, int a_stride, + int xoffset, int yoffset, + uint8_t *temp2, int counter) { + uint8_t *temp2_ptr = temp2; + mips_reg l_counter = counter; + double ftmp[7]; + mips_reg tmp[2]; + DECLARE_ALIGNED(8, const uint64_t, ff_ph_40) = { 0x0040004000400040ULL }; + DECLARE_ALIGNED(8, const uint64_t, mask) = { 0x00ff00ff00ff00ffULL }; + const uint8_t *filter_x = bilinear_filters[xoffset]; + const uint8_t *filter_y = bilinear_filters[yoffset]; + + __asm__ volatile ( + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + MMI_LI(%[tmp0], 0x07) + MMI_MTC1(%[tmp0], %[ftmp6]) + "pshufh %[filter_x0], %[filter_x0], %[ftmp0] \n\t" + "pshufh %[filter_x1], %[filter_x1], %[ftmp0] \n\t" + "pshufh %[filter_y0], %[filter_y0], %[ftmp0] \n\t" + "pshufh %[filter_y1], %[filter_y1], %[ftmp0] \n\t" + // fdata3: fdata3[0] ~ fdata3[3] + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + + // fdata3 +a_stride*1: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + // temp2: temp2[0] ~ temp2[7] + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + // fdata3 +a_stride*2: fdata3[0] ~ fdata3[3] + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + // temp2+4*1: temp2[0] ~ temp2[7] + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + + "1: \n\t" + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_B + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_A + + MMI_ADDU(%[a], %[a], %[a_stride]) + VAR_FILTER_BLOCK2D_BIL_FIRST_PASS_4_A + MMI_ADDIU(%[temp2_ptr], %[temp2_ptr], 0x04) + VAR_FILTER_BLOCK2D_BIL_SECOND_PASS_4_B + "addiu %[counter], %[counter], -0x01 \n\t" + "bnez %[counter], 1b \n\t" + : [ftmp0] "=&f"(ftmp[0]), [ftmp1] "=&f"(ftmp[1]), [ftmp2] "=&f"(ftmp[2]), + [ftmp3] "=&f"(ftmp[3]), [ftmp4] "=&f"(ftmp[4]), [ftmp5] "=&f"(ftmp[5]), + [ftmp6] "=&f"(ftmp[6]), [tmp0] "=&r"(tmp[0]), [a] "+&r"(a), + [temp2_ptr] "+&r"(temp2_ptr), [counter]"+&r"(l_counter) + : [filter_x0] "f"((uint64_t)filter_x[0]), + [filter_x1] "f"((uint64_t)filter_x[1]), + [filter_y0] "f"((uint64_t)filter_y[0]), + [filter_y1] "f"((uint64_t)filter_y[1]), + [a_stride] "r"((mips_reg)a_stride), [ff_ph_40] "f"(ff_ph_40), + [mask] "f"(mask) + : "memory" + ); +} + +#define SUBPIX_VAR4XN(H) \ + uint32_t vpx_sub_pixel_variance4x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int 
b_stride, uint32_t *sse) { \ + uint8_t temp2[4 * H]; \ + var_filter_block2d_bil_4x(a, a_stride, xoffset, yoffset, temp2, \ + (H - 2) / 2); \ + \ + return vpx_variance4x##H##_mmi(temp2, 4, b, b_stride, sse); \ + } + +SUBPIX_VAR4XN(8) +SUBPIX_VAR4XN(4) + +#define SUBPIX_AVG_VAR(W, H) \ + uint32_t vpx_sub_pixel_avg_variance##W##x##H##_mmi( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters[xoffset]); \ + var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ + bilinear_filters[yoffset]); \ + \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ + \ + return vpx_variance##W##x##H##_mmi(temp3, W, b, b_stride, sse); \ + } + +SUBPIX_AVG_VAR(64, 64) +SUBPIX_AVG_VAR(64, 32) +SUBPIX_AVG_VAR(32, 64) +SUBPIX_AVG_VAR(32, 32) +SUBPIX_AVG_VAR(32, 16) +SUBPIX_AVG_VAR(16, 32) +SUBPIX_AVG_VAR(16, 16) +SUBPIX_AVG_VAR(16, 8) +SUBPIX_AVG_VAR(8, 16) +SUBPIX_AVG_VAR(8, 8) +SUBPIX_AVG_VAR(8, 4) +SUBPIX_AVG_VAR(4, 8) +SUBPIX_AVG_VAR(4, 4) diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index ad2af2866..187a01342 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -16,8 +16,9 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 dst0, dst1, dst2, dst3, res2, res3; + v16u8 dst0 = { 0 }, res; v16u8 mask0, mask1, mask2, mask3; v8i16 filt, res0, res1; @@ -36,23 +37,23 @@ static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, XORI_B4_128_SB(src0, src1, src2, src3); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, res0, res1); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, res2, res3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - XORI_B2_128_UB(res2, res3); - AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, vec0, vec1, vec2, vec3; mask0 = LD_UB(&mc_filt_mask_arr[16]); @@ -69,7 +70,10 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, LD_SB4(src, src_stride, src0, src1, src2, src3); XORI_B4_128_SB(src0, src1, src2, src3); src += (4 * src_stride); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, 
tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, vec0, vec1); LD_SB4(src, src_stride, src0, src1, src2, src3); @@ -82,10 +86,7 @@ static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, res3); ILVR_D2_UB(res1, res0, res3, res2, res0, res2); XORI_B2_128_UB(res0, res2); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4); - AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); ST4x8_UB(res0, res2, dst, dst_stride); } @@ -105,8 +106,9 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { int32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3; - v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3; + v16u8 mask0, mask1, mask2, mask3, dst0 = { 0 }, dst1 = { 0 }; v8i16 filt, out0, out1, out2, out3; mask0 = LD_UB(&mc_filt_mask_arr[0]); @@ -127,10 +129,12 @@ static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } @@ -309,8 +313,9 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1; + v16u8 filt0, dst0 = { 0 }, vec0, vec1, res; v8u16 vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -320,23 +325,24 @@ static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB4(src, src_stride, src0, src1, src2, src3); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); SRARI_H2_UH(vec2, vec3, FILTER_BITS); - PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + res = (v16u8)__msa_pckev_b((v16i8)vec3, (v16i8)vec2); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask; v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 vec4, vec5, vec6, vec7, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -346,7 +352,10 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, filt0 = 
(v16u8)__msa_splati_h((v8i16)filt, 0); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1); VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5, @@ -354,13 +363,9 @@ static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(vec4, vec5, vec6, vec7, FILTER_BITS); PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, res3); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + ILVR_D2_UB(res1, res0, res3, res2, res0, res2); + AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2); + ST4x8_UB(res0, res2, dst, dst_stride); } static void common_hz_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -378,8 +383,9 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -394,16 +400,18 @@ static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } static void common_hz_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { + int64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, mask; - v16u8 filt0, dst0, dst1, dst2, dst3; + v16u8 filt0, dst0 = { 0 }, dst1 = { 0 }; v8u16 vec0, vec1, vec2, vec3, filt; mask = LD_SB(&mc_filt_mask_arr[0]); @@ -419,11 +427,12 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); src += (4 * src_stride); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -431,9 +440,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, 
dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); if (16 == height) { @@ -445,10 +455,11 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); LD_SB4(src, src_stride, src0, src1, src2, src3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); @@ -456,9 +467,10 @@ static void common_hz_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(vec0, vec1, vec2, vec3, dst0, dst1, dst, dst_stride); } } @@ -633,9 +645,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -668,8 +681,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -695,8 +708,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 1cfa63201..5187cea21 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -16,8 +16,9 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1; + v16u8 dst0 = { 0 }, mask0, mask1, mask2, mask3, res; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; 
v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4; @@ -59,7 +60,8 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); hz_out6 = (v8i16)__msa_sldi_b((v16i8)hz_out7, (v16i8)hz_out5, 8); @@ -73,14 +75,12 @@ static void common_hv_8ht_8vt_and_aver_dst_4w_msa( vec4 = (v8i16)__msa_ilvev_b((v16i8)hz_out9, (v16i8)hz_out8); res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1, filt_vt2, filt_vt3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); SRARI_H2_SH(res0, res1, FILTER_BITS); SAT_SH2_SH(res0, res1, 7); - PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1); - XORI_B2_128_UB(tmp0, tmp1); - AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); + res = PCKEV_XORI128_UB(res0, res1); + res = (v16u8)__msa_aver_u_b(res, dst0); + ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); hz_out5 = hz_out9; @@ -94,10 +94,11 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3; v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3; - v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3; + v16u8 dst0 = { 0 }, dst1 = { 0 }, mask0, mask1, mask2, mask3; v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3; v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9; @@ -144,7 +145,9 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( XORI_B4_128_SB(src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0, filt_hz1, filt_hz2, filt_hz3); @@ -172,7 +175,7 @@ static void common_hv_8ht_8vt_and_aver_dst_8w_msa( SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); - CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -225,9 +228,10 @@ static void common_hv_8ht_8vt_and_aver_dst_64w_msa( static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; v16u8 filt_hz, filt_vt, vec0, vec1; - v16u8 dst0, dst1, dst2, dst3, res0, res1; + v16u8 dst0 = { 0 }, out; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt; mask = LD_SB(&mc_filt_mask_arr[16]); @@ -248,21 +252,22 @@ static void common_hv_2ht_2vt_and_aver_dst_4x4_msa( hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); - LD_UB4(dst, dst_stride, dst0, dst1, 
dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1); SRARI_H2_UH(tmp0, tmp1, FILTER_BITS); - PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0); + out = __msa_aver_u_b(out, dst0); + ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask; - v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3; - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6; v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -289,21 +294,18 @@ static void common_hv_2ht_2vt_and_aver_dst_4x8_msa( hz_out3, hz_out5, 8); hz_out7 = (v8u16)__msa_pckod_d((v2i64)hz_out8, (v2i64)hz_out6); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2, dst4, - dst6); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1); ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1, res2, - res3); - AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1, res2, - res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1); + AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1); + ST4x8_UB(res0, res1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_4w_msa( @@ -321,8 +323,9 @@ static void common_hv_2ht_2vt_and_aver_dst_4w_msa( static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert) { + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3; + v16u8 filt_hz, filt_vt, dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -338,7 +341,9 @@ static void common_hv_2ht_2vt_and_aver_dst_8x4_msa( LD_SB5(src, src_stride, src0, src1, src2, src3, src4); src += (5 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS); vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0); @@ -357,16 +362,16 @@ static void 
common_hv_2ht_2vt_and_aver_dst_8x4_msa( tmp3 = __msa_dotp_u_h(vec3, filt_vt); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter_horiz, int8_t *filter_vert, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, mask; - v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3; + v16u8 filt_hz, filt_vt, vec0, dst0 = { 0 }, dst1 = { 0 }; v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -407,9 +412,10 @@ static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa( tmp3 = __msa_dotp_u_h(vec0, filt_vt); SRARI_H2_UH(tmp2, tmp3, FILTER_BITS); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -516,9 +522,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa( void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -560,14 +567,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -596,8 +603,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 146ce3b2f..ef8c90114 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -17,8 +17,9 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3, out; + v16u8 dst0 = { 0 }, out; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, 
src8776; v16i8 src10998, filt0, filt1, filt2, filt3; @@ -43,7 +44,8 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998); @@ -55,9 +57,6 @@ static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, SRARI_H2_SH(out10, out32, FILTER_BITS); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2); - - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst2, (v2i64)dst0); out = __msa_aver_u_b(out, dst0); ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); @@ -75,8 +74,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + uint64_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; - v16u8 dst0, dst1, dst2, dst3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r; v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3; v8i16 filt, out0, out1, out2, out3; @@ -98,7 +98,9 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, LD_SB4(src, src_stride, src7, src8, src9, src10); src += (4 * src_stride); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); XORI_B4_128_SB(src7, src8, src9, src10); ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r, src87_r, src98_r, src109_r); @@ -112,7 +114,7 @@ static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, filt1, filt2, filt3); SRARI_H4_SH(out0, out1, out2, out3, FILTER_BITS); SAT_SH4_SH(out0, out1, out2, out3, 7); - CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3, dst, + CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); @@ -246,8 +248,9 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + uint32_t tp0, tp1, tp2, tp3; v16i8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332; + v16u8 dst0 = { 0 }, out, filt0, src2110, src4332; v16i8 src10_r, src32_r, src21_r, src43_r; v8i16 filt; v8u16 tmp0, tmp1; @@ -261,9 +264,8 @@ static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, src4 = LD_SB(src); src += src_stride; - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1); - dst0 = (v16u8)__msa_ilvr_d((v2i64)dst1, (v2i64)dst0); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332); @@ -280,7 +282,8 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { - v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + uint32_t tp0, tp1, tp2, tp3; + v16u8 dst0 = { 0 }, dst1 = { 0 }; v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r; v16i8 src10_r, src32_r, src54_r, 
src76_r, src21_r, src43_r, src65_r; v16u8 src2110, src4332, src6554, src8776, filt0; @@ -294,10 +297,10 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, src += (8 * src_stride); src8 = LD_SB(src); - LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7); - ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1, dst2, - dst3); - ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1); + LW4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0); + LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1); ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r, src32_r, src43_r); ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r, @@ -309,9 +312,7 @@ static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride); + ST4x8_UB(src2110, src4332, dst, dst_stride); } static void common_vt_2t_and_aver_dst_4w_msa(const uint8_t *src, @@ -329,8 +330,9 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter) { + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4; - v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0; + v16u8 dst0 = { 0 }, dst1 = { 0 }, vec0, vec1, vec2, vec3, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -339,22 +341,24 @@ static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, filt0 = (v16u8)__msa_splati_h(filt, 0); LD_UB5(src, src_stride, src0, src1, src2, src3, src4); - LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1); ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3); DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); } static void common_vt_2t_and_aver_dst_8x8mult_msa( const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int8_t *filter, int32_t height) { uint32_t loop_cnt; + int64_t tp0, tp1, tp2, tp3; v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8; - v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8; + v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 }; v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0; v8u16 tmp0, tmp1, tmp2, tmp3; v8i16 filt; @@ -369,7 +373,12 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( for (loop_cnt = (height >> 3); loop_cnt--;) { LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8); src += (8 * src_stride); - LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8); + LD4(dst, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst0); + INSERT_D2_UB(tp2, tp3, dst1); + LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3); + INSERT_D2_UB(tp0, tp1, dst2); + INSERT_D2_UB(tp2, tp3, dst3); ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, vec0, vec1, vec2, vec3); @@ -378,15 
+387,13 @@ static void common_vt_2t_and_aver_dst_8x8mult_msa( DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3, dst4, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst, dst_stride); dst += (4 * dst_stride); DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS); - PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3, dst8, dst, - dst_stride); + PCKEV_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst2, dst3, dst, dst_stride); dst += (4 * dst_stride); src0 = src8; @@ -605,9 +612,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -640,8 +648,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -668,8 +676,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index 9e8bf7b51..152dc2610 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -621,9 +621,10 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c index 
b16ec5788..d35a5a7a6 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_msa.c @@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, int32_t src_stride, } void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int32_t x_step_q4, const int16_t *filter_y, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, &filt_ver[3], (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -621,9 +623,605 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } } + +static void filter_horiz_w4_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 8, 16, 24, 4, 12, 20, 28, 1, 9, 17, 25, 5, 13, 21, 29 }; + v16i8 shf2 = shf1 + 2; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, filt0, filt1, filt2, filt3; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(x_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_horiz_w8_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 
11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + + XORI_B4_128_SB(out0, out1, out2, out3); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_horiz_w16_msa(const uint8_t *src_x, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *x_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v16u8 src4 = { 0 }, src5 = { 0 }, src6 = { 0 }, src7 = { 0 }; + v16u8 tmp0, tmp1, tmp2, tmp3, dst0; + v16i8 out0, out1, out2, out3, out4, out5, out6, out7; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + v8i16 dst0_h, dst1_h, dst2_h, dst3_h; + + LD4(src_x, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src0); + INSERT_D2_UB(srcd2, srcd3, src1); + LD4(src_x + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src2); + INSERT_D2_UB(srcd2, srcd3, src3); + LD4(src_x + 8 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src4); + INSERT_D2_UB(srcd2, srcd3, src5); + LD4(src_x + 12 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_UB(srcd0, srcd1, src6); + INSERT_D2_UB(srcd2, srcd3, src7); + + filt = LD_SH(x_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + // transpose + VSHF_B2_UB(src0, src1, src0, src1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src2, src3, src2, src3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out0, out1); + ILVRL_W2_SB(tmp3, tmp1, out2, out3); + XORI_B4_128_SB(out0, out1, out2, out3); + + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + UNPCK_SB_SH(out2, src4_h, src5_h); + UNPCK_SB_SH(out3, src6_h, src7_h); + + VSHF_B2_UB(src4, src5, src4, src5, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(src6, src7, src6, src7, shf1, shf2, tmp2, tmp3); + ILVRL_W2_SB(tmp2, tmp0, out4, out5); + ILVRL_W2_SB(tmp3, tmp1, out6, out7); + XORI_B4_128_SB(out4, out5, out6, out7); + + dst0_h = src0_h * filt0; + dst1_h = src4_h * filt4; + dst0_h += src1_h * filt1; + dst1_h += src5_h * 
filt5; + dst0_h += src2_h * filt2; + dst1_h += src6_h * filt6; + dst0_h += src3_h * filt3; + dst1_h += src7_h * filt7; + + UNPCK_SB_SH(out4, src0_h, src1_h); + UNPCK_SB_SH(out5, src2_h, src3_h); + UNPCK_SB_SH(out6, src4_h, src5_h); + UNPCK_SB_SH(out7, src6_h, src7_h); + + dst2_h = src0_h * filt0; + dst3_h = src4_h * filt4; + dst2_h += src1_h * filt1; + dst3_h += src5_h * filt5; + dst2_h += src2_h * filt2; + dst3_h += src6_h * filt6; + dst2_h += src3_h * filt3; + dst3_h += src7_h * filt7; + + ADDS_SH2_SH(dst0_h, dst1_h, dst2_h, dst3_h, dst0_h, dst2_h); + SRARI_H2_SH(dst0_h, dst2_h, FILTER_BITS); + SAT_SH2_SH(dst0_h, dst2_h, 7); + dst0 = PCKEV_XORI128_UB(dst0_h, dst2_h); + ST_UB(dst0, dst); +} + +static void transpose4x4_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0; + v16i8 out0 = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + + in0 = LD_UB(src); + out0 = __msa_vshf_b(out0, (v16i8)in0, (v16i8)in0); + ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride); +} + +static void transpose8x8_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3; + v16i8 shf1 = { 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27 }; + v16i8 shf2 = shf1 + 4; + + LD_UB4(src, 16, in0, in1, in2, in3); + VSHF_B2_UB(in0, in1, in0, in1, shf1, shf2, tmp0, tmp1); + VSHF_B2_UB(in2, in3, in2, in3, shf1, shf2, tmp2, tmp3); + ILVRL_W2_UB(tmp2, tmp0, out0, out1); + ILVRL_W2_UB(tmp3, tmp1, out2, out3); + ST8x4_UB(out0, out1, dst, dst_stride); + ST8x4_UB(out2, out3, dst + 4 * dst_stride, dst_stride); +} + +static void transpose16x16_to_dst(const uint8_t *src, uint8_t *dst, + ptrdiff_t dst_stride) { + v16u8 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12; + v16u8 in13, in14, in15, out0, out1, out2, out3, out4, out5, out6, out7, out8; + v16u8 out9, out10, out11, out12, out13, out14, out15; + + LD_UB8(src, 16, in0, in1, in2, in3, in4, in5, in6, in7); + LD_UB8(src + 16 * 8, 16, in8, in9, in10, in11, in12, in13, in14, in15); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out0, out1, out2, out3, + out4, out5, out6, out7); + ST_UB8(out0, out1, out2, out3, out4, out5, out6, out7, dst, dst_stride); + dst += 8 * dst_stride; + + SLDI_B4_0_UB(in0, in1, in2, in3, in0, in1, in2, in3, 8); + SLDI_B4_0_UB(in4, in5, in6, in7, in4, in5, in6, in7, 8); + SLDI_B4_0_UB(in8, in9, in10, in11, in8, in9, in10, in11, 8); + SLDI_B4_0_UB(in12, in13, in14, in15, in12, in13, in14, in15, 8); + + TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, + in11, in12, in13, in14, in15, out8, out9, out10, out11, + out12, out13, out14, out15); + ST_UB8(out8, out9, out10, out11, out12, out13, out14, out15, dst, dst_stride); +} + +static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; y += 4) { + int x_q4 = x0_q4; + for (z = 0; z < 4; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w4_msa(src_x, src_stride, temp + (z * 4), x_filter); + } else { + for (i = 0; i < 4; ++i) { + temp[z * 4 + i] = src_x[i * src_stride + 3]; + } + } + + x_q4 += x_step_q4; + } + + 
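
The horizontal loops above walk the source in Q4 fixed point: the high bits of x_q4 select the source column, the low SUBPEL_BITS bits select one of 16 sub-pixel filter phases, and a zero phase falls back to a plain copy of tap 3 (SUBPEL_TAPS / 2 - 1), since the zero-phase kernel is a unit impulse at that tap. A scalar sketch of the same bookkeeping, using libvpx's public clip_pixel/ROUND_POWER_OF_TWO helpers (the function name is illustrative, not part of the patch):

    static void scaled_horiz_row_scalar(const uint8_t *src, uint8_t *out,
                                        const InterpKernel *kernels, int x0_q4,
                                        int x_step_q4, int w) {
      int x, k, x_q4 = x0_q4;
      for (x = 0; x < w; ++x) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];  /* column */
        const int16_t *const f = kernels[x_q4 & SUBPEL_MASK];    /* phase */
        int sum = 0;
        for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * f[k];
        out[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
        x_q4 += x_step_q4;
      }
    }
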
transpose4x4_to_dst(temp, dst, dst_stride); + + src += src_stride * 4; + dst += dst_stride * 4; + } +} + +static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); + int y, z, i; + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 8x8 areas. The intermediate height is not always + // a multiple of 8, so force it to be a multiple of 8 here. + y = h + (8 - (h & 0x7)); + + do { + int x_q4 = x0_q4; + for (z = 0; z < 8; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w8_msa(src_x, src_stride, temp + (z * 8), x_filter); + } else { + for (i = 0; i < 8; ++i) { + temp[z * 8 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose8x8_to_dst(temp, dst, dst_stride); + + src += src_stride * 8; + dst += dst_stride * 8; + } while (y -= 8); +} + +static void scaledconvolve_horiz_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + DECLARE_ALIGNED(16, uint8_t, temp[16 * 16]); + int x, y, z, i; + + src -= SUBPEL_TAPS / 2 - 1; + + // This function processes 16x16 areas. The intermediate height is not always + // a multiple of 16, so force it to be a multiple of 16 here. + y = h + (16 - (h & 0xF)); + + do { + int x_q4 = x0_q4; + for (x = 0; x < w; x += 16) { + for (z = 0; z < 16; ++z) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + + if (x_q4 & SUBPEL_MASK) { + filter_horiz_w16_msa(src_x, src_stride, temp + (z * 16), x_filter); + } else { + for (i = 0; i < 16; ++i) { + temp[z * 16 + i] = src_x[3 + i * src_stride]; + } + } + + x_q4 += x_step_q4; + } + + transpose16x16_to_dst(temp, dst + x, dst_stride); + } + + src += src_stride * 16; + dst += dst_stride * 16; + } while (y -= 16); +} + +static void filter_vert_w4_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint32_t srcw0, srcw1, srcw2, srcw3, srcw4, srcw5, srcw6, srcw7; + uint32_t res; + v16u8 src0 = { 0 }, src1 = { 0 }, dst0; + v16i8 out0, out1; + v16i8 shf1 = { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + v16i8 shf2 = shf1 + 8; + v16i8 filt_shf0 = { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 8, 9, 8, 9, 8, 9 }; + v16i8 filt_shf1 = filt_shf0 + 2; + v16i8 filt_shf2 = filt_shf0 + 4; + v16i8 filt_shf3 = filt_shf0 + 6; + v8i16 filt, src0_h, src1_h, src2_h, src3_h; + v8i16 filt0, filt1, filt2, filt3; + + LW4(src_y, src_pitch, srcw0, srcw1, srcw2, srcw3); + LW4(src_y + 4 * src_pitch, src_pitch, srcw4, srcw5, srcw6, srcw7); + INSERT_W4_UB(srcw0, srcw1, srcw2, srcw3, src0); + INSERT_W4_UB(srcw4, srcw5, srcw6, srcw7, src1); + VSHF_B2_SB(src0, src1, src0, src1, shf1, shf2, out0, out1); + XORI_B2_128_SB(out0, out1); + UNPCK_SB_SH(out0, src0_h, src1_h); + UNPCK_SB_SH(out1, src2_h, src3_h); + + filt = LD_SH(y_filter); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf0, filt_shf1, filt0, filt1); + VSHF_B2_SH(filt, filt, filt, filt, filt_shf2, filt_shf3, filt2, filt3); + + src0_h *= filt0; + src0_h += src1_h * filt1; + src0_h += src2_h * filt2; + src0_h += src3_h * filt3; + + src1_h = (v8i16)__msa_sldi_b((v16i8)src0_h, (v16i8)src0_h, 8); + + src0_h = __msa_adds_s_h(src0_h, src1_h); + src0_h = __msa_srari_h(src0_h, 
FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + res = __msa_copy_u_w((v4i32)dst0, 0); + SW(res, dst); +} + +static void filter_vert_w8_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter) { + uint64_t srcd0, srcd1, srcd2, srcd3; + v16u8 dst0; + v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 }; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + LD4(src_y, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src0); + INSERT_D2_SB(srcd2, srcd3, src1); + LD4(src_y + 4 * src_pitch, src_pitch, srcd0, srcd1, srcd2, srcd3); + INSERT_D2_SB(srcd0, srcd1, src2); + INSERT_D2_SB(srcd2, srcd3, src3); + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + XORI_B4_128_SB(src0, src1, src2, src3); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + + src0_h *= filt0; + src4_h *= filt4; + src0_h += src1_h * filt1; + src4_h += src5_h * filt5; + src0_h += src2_h * filt2; + src4_h += src6_h * filt6; + src0_h += src3_h * filt3; + src4_h += src7_h * filt7; + + src0_h = __msa_adds_s_h(src0_h, src4_h); + src0_h = __msa_srari_h(src0_h, FILTER_BITS); + src0_h = __msa_sat_s_h(src0_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src0_h); + ST8x1_UB(dst0, dst); +} + +static void filter_vert_mul_w16_msa(const uint8_t *src_y, ptrdiff_t src_pitch, + uint8_t *dst, const int16_t *y_filter, + int w) { + int x; + v16u8 dst0; + v16i8 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 filt, src0_h, src1_h, src2_h, src3_h, src4_h, src5_h, src6_h, src7_h; + v8i16 src8_h, src9_h, src10_h, src11_h, src12_h, src13_h, src14_h, src15_h; + v8i16 filt0, filt1, filt2, filt3, filt4, filt5, filt6, filt7; + + filt = LD_SH(y_filter); + SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3); + SPLATI_H4_SH(filt, 4, 5, 6, 7, filt4, filt5, filt6, filt7); + + for (x = 0; x < w; x += 16) { + LD_SB8(src_y, src_pitch, src0, src1, src2, src3, src4, src5, src6, src7); + src_y += 16; + + XORI_B4_128_SB(src0, src1, src2, src3); + XORI_B4_128_SB(src4, src5, src6, src7); + UNPCK_SB_SH(src0, src0_h, src1_h); + UNPCK_SB_SH(src1, src2_h, src3_h); + UNPCK_SB_SH(src2, src4_h, src5_h); + UNPCK_SB_SH(src3, src6_h, src7_h); + UNPCK_SB_SH(src4, src8_h, src9_h); + UNPCK_SB_SH(src5, src10_h, src11_h); + UNPCK_SB_SH(src6, src12_h, src13_h); + UNPCK_SB_SH(src7, src14_h, src15_h); + + src0_h *= filt0; + src1_h *= filt0; + src8_h *= filt4; + src9_h *= filt4; + src0_h += src2_h * filt1; + src1_h += src3_h * filt1; + src8_h += src10_h * filt5; + src9_h += src11_h * filt5; + src0_h += src4_h * filt2; + src1_h += src5_h * filt2; + src8_h += src12_h * filt6; + src9_h += src13_h * filt6; + src0_h += src6_h * filt3; + src1_h += src7_h * filt3; + src8_h += src14_h * filt7; + src9_h += src15_h * filt7; + + ADDS_SH2_SH(src0_h, src8_h, src1_h, src9_h, src0_h, src1_h); + SRARI_H2_SH(src0_h, src1_h, FILTER_BITS); + SAT_SH2_SH(src0_h, src1_h, 7); + dst0 = PCKEV_XORI128_UB(src0_h, src1_h); + ST_UB(dst0, dst); + dst += 16; + } +} + +static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 
1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w4_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint32_t srcd = LW(src_y + 3 * src_stride); + SW(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int h) { + int y; + int y_q4 = y0_q4; + + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_w8_msa(src_y, src_stride, &dst[y * dst_stride], y_filter); + } else { + uint64_t srcd = LD(src_y + 3 * src_stride); + SD(srcd, dst + y * dst_stride); + } + + y_q4 += y_step_q4; + } +} + +static void scaledconvolve_vert_mul16(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *y_filters, int y0_q4, + int y_step_q4, int w, int h) { + int x, y; + int y_q4 = y0_q4; + src -= src_stride * (SUBPEL_TAPS / 2 - 1); + + for (y = 0; y < h; ++y) { + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; + + if (y_q4 & SUBPEL_MASK) { + filter_vert_mul_w16_msa(src_y, src_stride, &dst[y * dst_stride], y_filter, + w); + } else { + for (x = 0; x < w; ++x) { + dst[x + y * dst_stride] = src_y[x + 3 * src_stride]; + } + } + + y_q4 += y_step_q4; + } +} + +void vpx_scaled_2d_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // --Require an additional 8 rows for the horiz_w8 transpose tail. 
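
Making the bound above concrete (a sketch, not part of the patch; the helper name is illustrative): 64 output rows at the largest normative step cover (64 - 1) * 32 = 2016 sixteenth-pel units, i.e. 126 whole rows plus a fraction, rounded up to 127 for a sub-pixel y0_q4 start; adding SUBPEL_TAPS = 8 rows of filter tail gives 135. The intermediate_height expression used just below stays inside that bound:

    static int worst_case_intermediate_height(void) {
      const int h = 64, y_step_q4 = 32, y0_q4 = SUBPEL_MASK;  /* worst case */
      /* ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134, within the 135-row bound */
      return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
    }
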
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + if ((0 == x0_q4) && (16 == x_step_q4) && (0 == y0_q4) && (16 == y_step_q4)) { + vpx_convolve_copy_msa(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); + } else { + if (w >= 16) { + scaledconvolve_horiz_mul16(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + w, intermediate_height); + } else if (w == 8) { + scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } else { + scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), + src_stride, temp, 64, filter, x0_q4, x_step_q4, + intermediate_height); + } + + if (w >= 16) { + scaledconvolve_vert_mul16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, w, h); + } else if (w == 8) { + scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } else { + scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, + dst_stride, filter, y0_q4, y_step_q4, h); + } + } +} diff --git a/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 410682271..13fce0077 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c index 45399bad8..ce649935d 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, @@ -188,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c b/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c index c3d87a4ab..c2ab33a2f 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c +++ b/libvpx/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -9,6 +9,7 @@ */ #include <string.h> +#include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" static void copy_width8_msa(const uint8_t *src, int32_t src_stride, @@ -198,13 +199,14 @@ static void copy_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/libvpx/vpx_dsp/mips/vpx_convolve_msa.h index f75679521..d53244596 100644 --- a/libvpx/vpx_dsp/mips/vpx_convolve_msa.h +++ b/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -110,14 +110,13 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; ST_UB(tmp_m, (pdst)); \ } -#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, \ - stride) \ - { \ - v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - \ - PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ - PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ - AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ +#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1, pdst, stride) \ + { \ + v16u8 tmp0_m, tmp1_m; \ + uint8_t *pdst_m = (uint8_t *)(pdst); \ + \ + PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \ + AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ } #endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ diff --git a/libvpx/vpx_dsp/ppc/hadamard_vsx.c b/libvpx/vpx_dsp/ppc/hadamard_vsx.c index 435e3eb5b..e279b3047 100644 --- a/libvpx/vpx_dsp/ppc/hadamard_vsx.c +++ b/libvpx/vpx_dsp/ppc/hadamard_vsx.c @@ -42,7 +42,7 @@ static void vpx_hadamard_s16_8x8_one_pass(int16x8_t v[8]) { v[7] = vec_add(c1, c5); } -void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride, +void vpx_hadamard_8x8_vsx(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int16x8_t v[8]; @@ -71,7 +71,7 @@ void vpx_hadamard_8x8_vsx(const int16_t *src_diff, int src_stride, store_tran_low(v[7], 0, coeff + 56); } -void vpx_hadamard_16x16_vsx(const int16_t *src_diff, int src_stride, +void vpx_hadamard_16x16_vsx(const int16_t *src_diff, ptrdiff_t 
src_stride, tran_low_t *coeff) { int i; const uint16x8_t ones = vec_splat_u16(1); diff --git a/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c b/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c new file mode 100644 index 000000000..d43a9fd18 --- /dev/null +++ b/libvpx/vpx_dsp/ppc/inv_txfm_vsx.c @@ -0,0 +1,1063 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> +#include <stdlib.h> +#include <string.h> + +#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h" +#include "vpx_dsp/ppc/types_vsx.h" + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/inv_txfm.h" + +static int16x8_t cospi1_v = { 16364, 16364, 16364, 16364, + 16364, 16364, 16364, 16364 }; +static int16x8_t cospi2_v = { 16305, 16305, 16305, 16305, + 16305, 16305, 16305, 16305 }; +static int16x8_t cospi3_v = { 16207, 16207, 16207, 16207, + 16207, 16207, 16207, 16207 }; +static int16x8_t cospi4_v = { 16069, 16069, 16069, 16069, + 16069, 16069, 16069, 16069 }; +static int16x8_t cospi4m_v = { -16069, -16069, -16069, -16069, + -16069, -16069, -16069, -16069 }; +static int16x8_t cospi5_v = { 15893, 15893, 15893, 15893, + 15893, 15893, 15893, 15893 }; +static int16x8_t cospi6_v = { 15679, 15679, 15679, 15679, + 15679, 15679, 15679, 15679 }; +static int16x8_t cospi7_v = { 15426, 15426, 15426, 15426, + 15426, 15426, 15426, 15426 }; +static int16x8_t cospi8_v = { 15137, 15137, 15137, 15137, + 15137, 15137, 15137, 15137 }; +static int16x8_t cospi8m_v = { -15137, -15137, -15137, -15137, + -15137, -15137, -15137, -15137 }; +static int16x8_t cospi9_v = { 14811, 14811, 14811, 14811, + 14811, 14811, 14811, 14811 }; +static int16x8_t cospi10_v = { 14449, 14449, 14449, 14449, + 14449, 14449, 14449, 14449 }; +static int16x8_t cospi11_v = { 14053, 14053, 14053, 14053, + 14053, 14053, 14053, 14053 }; +static int16x8_t cospi12_v = { 13623, 13623, 13623, 13623, + 13623, 13623, 13623, 13623 }; +static int16x8_t cospi13_v = { 13160, 13160, 13160, 13160, + 13160, 13160, 13160, 13160 }; +static int16x8_t cospi14_v = { 12665, 12665, 12665, 12665, + 12665, 12665, 12665, 12665 }; +static int16x8_t cospi15_v = { 12140, 12140, 12140, 12140, + 12140, 12140, 12140, 12140 }; +static int16x8_t cospi16_v = { 11585, 11585, 11585, 11585, + 11585, 11585, 11585, 11585 }; +static int16x8_t cospi17_v = { 11003, 11003, 11003, 11003, + 11003, 11003, 11003, 11003 }; +static int16x8_t cospi18_v = { 10394, 10394, 10394, 10394, + 10394, 10394, 10394, 10394 }; +static int16x8_t cospi19_v = { 9760, 9760, 9760, 9760, 9760, 9760, 9760, 9760 }; +static int16x8_t cospi20_v = { 9102, 9102, 9102, 9102, 9102, 9102, 9102, 9102 }; +static int16x8_t cospi20m_v = { -9102, -9102, -9102, -9102, + -9102, -9102, -9102, -9102 }; +static int16x8_t cospi21_v = { 8423, 8423, 8423, 8423, 8423, 8423, 8423, 8423 }; +static int16x8_t cospi22_v = { 7723, 7723, 7723, 7723, 7723, 7723, 7723, 7723 }; +static int16x8_t cospi23_v = { 7005, 7005, 7005, 7005, 7005, 7005, 7005, 7005 }; +static int16x8_t cospi24_v = { 6270, 6270, 6270, 6270, 6270, 6270, 6270, 6270 }; +static int16x8_t cospi24_mv = { -6270, -6270, -6270, -6270, + -6270, -6270, -6270, -6270 }; +static int16x8_t cospi25_v = { 5520, 5520, 5520, 5520, 5520, 5520, 5520, 5520 }; +static int16x8_t cospi26_v = { 
4756, 4756, 4756, 4756, 4756, 4756, 4756, 4756 }; +static int16x8_t cospi27_v = { 3981, 3981, 3981, 3981, 3981, 3981, 3981, 3981 }; +static int16x8_t cospi28_v = { 3196, 3196, 3196, 3196, 3196, 3196, 3196, 3196 }; +static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; +static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; +static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; + +#define ROUND_SHIFT_INIT \ + const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ + const uint32x4_t shift14 = vec_splat_u32(14); + +#define DCT_CONST_ROUND_SHIFT(vec) vec = vec_sra(vec_add(vec, shift), shift14); + +#define PIXEL_ADD_INIT \ + int16x8_t add8 = vec_splat_s16(8); \ + uint16x8_t shift4 = vec_splat_u16(4); + +#define PIXEL_ADD4(out, in) out = vec_sra(vec_add(in, add8), shift4); + +#define IDCT4(in0, in1, out0, out1) \ + t0 = vec_add(in0, in1); \ + t1 = vec_sub(in0, in1); \ + tmp16_0 = vec_mergeh(t0, t1); \ + temp1 = vec_sra(vec_add(vec_mule(tmp16_0, cospi16_v), shift), shift14); \ + temp2 = vec_sra(vec_add(vec_mulo(tmp16_0, cospi16_v), shift), shift14); \ + \ + tmp16_0 = vec_mergel(in0, in1); \ + temp3 = vec_sub(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp3); \ + temp4 = vec_add(vec_mule(tmp16_0, cospi8_v), vec_mulo(tmp16_0, cospi24_v)); \ + DCT_CONST_ROUND_SHIFT(temp4); \ + \ + step0 = vec_packs(temp1, temp2); \ + step1 = vec_packs(temp4, temp3); \ + out0 = vec_add(step0, step1); \ + out1 = vec_sub(step0, step1); \ + out1 = vec_perm(out1, out1, mask0); + +void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp1, temp2, temp3, temp4; + int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; + uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; + uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; + int16x8_t v0 = load_tran_low(0, input); + int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); + int16x8_t t0 = vec_mergeh(v0, v1); + int16x8_t t1 = vec_mergel(v0, v1); + + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + ROUND_SHIFT_INIT + PIXEL_ADD_INIT; + + v0 = vec_mergeh(t0, t1); + v1 = vec_mergel(t0, t1); + + IDCT4(v0, v1, t_out0, t_out1); + // transpose + t0 = vec_mergeh(t_out0, t_out1); + t1 = vec_mergel(t_out0, t_out1); + v0 = vec_mergeh(t0, t1); + v1 = vec_mergel(t0, t1); + IDCT4(v0, v1, t_out0, t_out1); + + PIXEL_ADD4(v0, t_out0); + PIXEL_ADD4(v1, t_out1); + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); + output_v = vec_packsu(tmp16_0, tmp16_1); + + vec_vsx_st(output_v, 0, tmp_dest); + for (int i = 0; i < 4; i++) + for (int j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; +} + +#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ + out3, out4, out5, out6, out7) \ + out0 = vec_mergeh(in0, in1); \ + out1 = vec_mergel(in0, in1); \ + out2 = vec_mergeh(in2, in3); \ 
+ out3 = vec_mergel(in2, in3); \ + out4 = vec_mergeh(in4, in5); \ + out5 = vec_mergel(in4, in5); \ + out6 = vec_mergeh(in6, in7); \ + out7 = vec_mergel(in6, in7); \ + in0 = (int16x8_t)vec_mergeh((int32x4_t)out0, (int32x4_t)out2); \ + in1 = (int16x8_t)vec_mergel((int32x4_t)out0, (int32x4_t)out2); \ + in2 = (int16x8_t)vec_mergeh((int32x4_t)out1, (int32x4_t)out3); \ + in3 = (int16x8_t)vec_mergel((int32x4_t)out1, (int32x4_t)out3); \ + in4 = (int16x8_t)vec_mergeh((int32x4_t)out4, (int32x4_t)out6); \ + in5 = (int16x8_t)vec_mergel((int32x4_t)out4, (int32x4_t)out6); \ + in6 = (int16x8_t)vec_mergeh((int32x4_t)out5, (int32x4_t)out7); \ + in7 = (int16x8_t)vec_mergel((int32x4_t)out5, (int32x4_t)out7); \ + out0 = vec_perm(in0, in4, tr8_mask0); \ + out1 = vec_perm(in0, in4, tr8_mask1); \ + out2 = vec_perm(in1, in5, tr8_mask0); \ + out3 = vec_perm(in1, in5, tr8_mask1); \ + out4 = vec_perm(in2, in6, tr8_mask0); \ + out5 = vec_perm(in2, in6, tr8_mask1); \ + out6 = vec_perm(in3, in7, tr8_mask0); \ + out7 = vec_perm(in3, in7, tr8_mask1); + +/* for the: temp1 = step[x] * cospi_q - step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP8_0(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mule(tmp16_0, cospi0), vec_mulo(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mule(tmp16_1, cospi0), vec_mulo(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define STEP8_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_2 = vec_sub(inpt0, inpt1); \ + tmp16_3 = vec_add(inpt0, inpt1); \ + tmp16_0 = vec_mergeh(tmp16_2, tmp16_3); \ + tmp16_1 = vec_mergel(tmp16_2, tmp16_3); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_mulo(tmp16_0, cospi); \ + temp11 = vec_mulo(tmp16_1, cospi); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7) \ + /* stage 1 */ \ + step0 = in0; \ + step2 = in4; \ + step1 = in2; \ + step3 = in6; \ + \ + STEP8_0(in1, in7, step4, step7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in3, step5, step6, cospi12_v, cospi20_v); \ + \ + /* stage 2 */ \ + STEP8_1(step0, step2, in1, in0, cospi16_v); \ + STEP8_0(step1, step3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(step4, step5); \ + in5 = vec_sub(step4, step5); \ + in6 = vec_sub(step7, step6); \ + in7 = vec_add(step6, step7); \ + \ + /* stage 3 */ \ + step0 = vec_add(in0, in3); \ + step1 = vec_add(in1, in2); \ + step2 = vec_sub(in1, in2); \ + step3 = vec_sub(in0, in3); \ + step4 = in4; \ + STEP8_1(in6, in5, step5, step6, cospi16_v); \ + step7 = in7; \ + \ + /* stage 4 */ \ + in0 = vec_add(step0, step7); \ + in1 = vec_add(step1, step6); \ + in2 = vec_add(step2, step5); \ + in3 = vec_add(step3, step4); \ + in4 = vec_sub(step3, step4); \ + in5 = vec_sub(step2, step5); \ + in6 = vec_sub(step1, step6); \ + in7 = vec_sub(step0, step7); + +#define PIXEL_ADD(in, out, add, shiftx) \ + out = vec_add(vec_sra(vec_add(in, add), shiftx), out); + 
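
The rounding idiom above has a simple scalar model (a sketch; assumes DCT_CONST_BITS == 14 as in vpx_dsp/txfm_common.h, and the helper names are illustrative): each vec_mule/vec_mulo product of pixels against the cospi constants carries 14 fraction bits, DCT_CONST_ROUND_SHIFT rounds them away just as dct_const_round_shift() does in the C reference, and PIXEL_ADD applies the per-pass output rounding before adding into the destination (add = 16 and shiftx = 5 for the 8x8 second pass below):

    static int32_t round_shift_14(int32_t x) {
      return (x + (1 << 13)) >> 14;  /* per-lane DCT_CONST_ROUND_SHIFT */
    }

    static uint8_t pixel_add_scalar(int16_t residual, uint8_t pred) {
      /* PIXEL_ADD for the 8x8 second pass: round the residual by
       * (x + 16) >> 5, then add to the predictor; the vector code
       * clamps to [0, 255] via vec_packsu. */
      int v = ((residual + 16) >> 5) + pred;
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
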
+static uint8x16_t tr8_mask0 = { + 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 +}; +static uint8x16_t tr8_mask1 = { + 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, + 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F +}; +void vpx_idct8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp10, temp11; + int16x8_t step0, step1, step2, step3, step4, step5, step6, step7; + int16x8_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp16_0, tmp16_1, + tmp16_2, tmp16_3; + int16x8_t src0 = load_tran_low(0, input); + int16x8_t src1 = load_tran_low(8 * sizeof(*input), input); + int16x8_t src2 = load_tran_low(16 * sizeof(*input), input); + int16x8_t src3 = load_tran_low(24 * sizeof(*input), input); + int16x8_t src4 = load_tran_low(32 * sizeof(*input), input); + int16x8_t src5 = load_tran_low(40 * sizeof(*input), input); + int16x8_t src6 = load_tran_low(48 * sizeof(*input), input); + int16x8_t src7 = load_tran_low(56 * sizeof(*input), input); + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + uint8x16_t dest4 = vec_vsx_ld(4 * stride, dest); + uint8x16_t dest5 = vec_vsx_ld(5 * stride, dest); + uint8x16_t dest6 = vec_vsx_ld(6 * stride, dest); + uint8x16_t dest7 = vec_vsx_ld(7 * stride, dest); + uint8x16_t zerov = vec_splat_u8(0); + int16x8_t d_u0 = (int16x8_t)vec_mergeh(dest0, zerov); + int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); + int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); + int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + int16x8_t d_u4 = (int16x8_t)vec_mergeh(dest4, zerov); + int16x8_t d_u5 = (int16x8_t)vec_mergeh(dest5, zerov); + int16x8_t d_u6 = (int16x8_t)vec_mergeh(dest6, zerov); + int16x8_t d_u7 = (int16x8_t)vec_mergeh(dest7, zerov); + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(1)); + uint16x8_t shift5 = vec_splat_u16(5); + uint8x16_t output0, output1, output2, output3; + ROUND_SHIFT_INIT; + + TRANSPOSE8x8(src0, src1, src2, src3, src4, src5, src6, src7, tmp0, tmp1, tmp2, + tmp3, tmp4, tmp5, tmp6, tmp7); + + IDCT8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); + TRANSPOSE8x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, src2, + src3, src4, src5, src6, src7); + IDCT8(src0, src1, src2, src3, src4, src5, src6, src7); + PIXEL_ADD(src0, d_u0, add, shift5); + PIXEL_ADD(src1, d_u1, add, shift5); + PIXEL_ADD(src2, d_u2, add, shift5); + PIXEL_ADD(src3, d_u3, add, shift5); + PIXEL_ADD(src4, d_u4, add, shift5); + PIXEL_ADD(src5, d_u5, add, shift5); + PIXEL_ADD(src6, d_u6, add, shift5); + PIXEL_ADD(src7, d_u7, add, shift5); + output0 = vec_packsu(d_u0, d_u1); + output1 = vec_packsu(d_u2, d_u3); + output2 = vec_packsu(d_u4, d_u5); + output3 = vec_packsu(d_u6, d_u7); + + vec_vsx_st(xxpermdi(output0, dest0, 1), 0, dest); + vec_vsx_st(xxpermdi(output0, dest1, 3), stride, dest); + vec_vsx_st(xxpermdi(output1, dest2, 1), 2 * stride, dest); + vec_vsx_st(xxpermdi(output1, dest3, 3), 3 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest4, 1), 4 * stride, dest); + vec_vsx_st(xxpermdi(output2, dest5, 3), 5 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest6, 1), 6 * stride, dest); + vec_vsx_st(xxpermdi(output3, dest7, 3), 7 * stride, dest); +} + +#define LOAD_INPUT16(load, source, offset, step, in0, in1, in2, in3, in4, in5, \ + in6, in7, in8, in9, inA, inB, inC, inD, inE, inF) \ + in0 = load(offset, source); \ + in1 = load((step) + (offset), source); \ + 
in2 = load(2 * (step) + (offset), source); \ + in3 = load(3 * (step) + (offset), source); \ + in4 = load(4 * (step) + (offset), source); \ + in5 = load(5 * (step) + (offset), source); \ + in6 = load(6 * (step) + (offset), source); \ + in7 = load(7 * (step) + (offset), source); \ + in8 = load(8 * (step) + (offset), source); \ + in9 = load(9 * (step) + (offset), source); \ + inA = load(10 * (step) + (offset), source); \ + inB = load(11 * (step) + (offset), source); \ + inC = load(12 * (step) + (offset), source); \ + inD = load(13 * (step) + (offset), source); \ + inE = load(14 * (step) + (offset), source); \ + inF = load(15 * (step) + (offset), source); + +#define STEP16_1(inpt0, inpt1, outpt0, outpt1, cospi) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_mule(tmp16_0, cospi); \ + temp11 = vec_mule(tmp16_1, cospi); \ + temp20 = vec_mulo(tmp16_0, cospi); \ + temp21 = vec_mulo(tmp16_1, cospi); \ + temp30 = vec_sub(temp10, temp20); \ + temp10 = vec_add(temp10, temp20); \ + temp20 = vec_sub(temp11, temp21); \ + temp21 = vec_add(temp11, temp21); \ + DCT_CONST_ROUND_SHIFT(temp30); \ + DCT_CONST_ROUND_SHIFT(temp20); \ + outpt0 = vec_packs(temp30, temp20); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp21); \ + outpt1 = vec_packs(temp10, temp21); + +#define IDCT16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, inA, inB, \ + inC, inD, inE, inF, out0, out1, out2, out3, out4, out5, out6, \ + out7, out8, out9, outA, outB, outC, outD, outE, outF) \ + /* stage 1 */ \ + /* out0 = in0; */ \ + out1 = in8; \ + out2 = in4; \ + out3 = inC; \ + out4 = in2; \ + out5 = inA; \ + out6 = in6; \ + out7 = inE; \ + out8 = in1; \ + out9 = in9; \ + outA = in5; \ + outB = inD; \ + outC = in3; \ + outD = inB; \ + outE = in7; \ + outF = inF; \ + \ + /* stage 2 */ \ + /* in0 = out0; */ \ + in1 = out1; \ + in2 = out2; \ + in3 = out3; \ + in4 = out4; \ + in5 = out5; \ + in6 = out6; \ + in7 = out7; \ + \ + STEP8_0(out8, outF, in8, inF, cospi30_v, cospi2_v); \ + STEP8_0(out9, outE, in9, inE, cospi14_v, cospi18_v); \ + STEP8_0(outA, outD, inA, inD, cospi22_v, cospi10_v); \ + STEP8_0(outB, outC, inB, inC, cospi6_v, cospi26_v); \ + \ + /* stage 3 */ \ + out0 = in0; \ + out1 = in1; \ + out2 = in2; \ + out3 = in3; \ + \ + STEP8_0(in4, in7, out4, out7, cospi28_v, cospi4_v); \ + STEP8_0(in5, in6, out5, out6, cospi12_v, cospi20_v); \ + \ + out8 = vec_add(in8, in9); \ + out9 = vec_sub(in8, in9); \ + outA = vec_sub(inB, inA); \ + outB = vec_add(inA, inB); \ + outC = vec_add(inC, inD); \ + outD = vec_sub(inC, inD); \ + outE = vec_sub(inF, inE); \ + outF = vec_add(inE, inF); \ + \ + /* stage 4 */ \ + STEP16_1(out0, out1, in1, in0, cospi16_v); \ + STEP8_0(out2, out3, in2, in3, cospi24_v, cospi8_v); \ + in4 = vec_add(out4, out5); \ + in5 = vec_sub(out4, out5); \ + in6 = vec_sub(out7, out6); \ + in7 = vec_add(out6, out7); \ + \ + in8 = out8; \ + inF = outF; \ + tmp16_0 = vec_mergeh(out9, outE); \ + tmp16_1 = vec_mergel(out9, outE); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + in9 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi24_v), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi24_v), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inE = vec_packs(temp10, temp11); \ + \ + tmp16_0 
= vec_mergeh(outA, outD); \ + tmp16_1 = vec_mergel(outA, outD); \ + temp10 = \ + vec_sub(vec_mule(tmp16_0, cospi24_mv), vec_mulo(tmp16_0, cospi8_v)); \ + temp11 = \ + vec_sub(vec_mule(tmp16_1, cospi24_mv), vec_mulo(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inA = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi24_v), vec_mule(tmp16_0, cospi8_v)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi24_v), vec_mule(tmp16_1, cospi8_v)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + inD = vec_packs(temp10, temp11); \ + \ + inB = outB; \ + inC = outC; \ + \ + /* stage 5 */ \ + out0 = vec_add(in0, in3); \ + out1 = vec_add(in1, in2); \ + out2 = vec_sub(in1, in2); \ + out3 = vec_sub(in0, in3); \ + out4 = in4; \ + STEP16_1(in6, in5, out5, out6, cospi16_v); \ + out7 = in7; \ + \ + out8 = vec_add(in8, inB); \ + out9 = vec_add(in9, inA); \ + outA = vec_sub(in9, inA); \ + outB = vec_sub(in8, inB); \ + outC = vec_sub(inF, inC); \ + outD = vec_sub(inE, inD); \ + outE = vec_add(inD, inE); \ + outF = vec_add(inC, inF); \ + \ + /* stage 6 */ \ + in0 = vec_add(out0, out7); \ + in1 = vec_add(out1, out6); \ + in2 = vec_add(out2, out5); \ + in3 = vec_add(out3, out4); \ + in4 = vec_sub(out3, out4); \ + in5 = vec_sub(out2, out5); \ + in6 = vec_sub(out1, out6); \ + in7 = vec_sub(out0, out7); \ + in8 = out8; \ + in9 = out9; \ + STEP16_1(outD, outA, inA, inD, cospi16_v); \ + STEP16_1(outC, outB, inB, inC, cospi16_v); \ + inE = outE; \ + inF = outF; \ + \ + /* stage 7 */ \ + out0 = vec_add(in0, inF); \ + out1 = vec_add(in1, inE); \ + out2 = vec_add(in2, inD); \ + out3 = vec_add(in3, inC); \ + out4 = vec_add(in4, inB); \ + out5 = vec_add(in5, inA); \ + out6 = vec_add(in6, in9); \ + out7 = vec_add(in7, in8); \ + out8 = vec_sub(in7, in8); \ + out9 = vec_sub(in6, in9); \ + outA = vec_sub(in5, inA); \ + outB = vec_sub(in4, inB); \ + outC = vec_sub(in3, inC); \ + outD = vec_sub(in2, inD); \ + outE = vec_sub(in1, inE); \ + outF = vec_sub(in0, inF); + +#define PIXEL_ADD_STORE16(in0, in1, dst, offset) \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), offset, dest); + +void vpx_idct16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int32x4_t temp10, temp11, temp20, temp21, temp30; + int16x8_t src00, src01, src02, src03, src04, src05, src06, src07, src10, + src11, src12, src13, src14, src15, src16, src17; + int16x8_t src20, src21, src22, src23, src24, src25, src26, src27, src30, + src31, src32, src33, src34, src35, src36, src37; + int16x8_t tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp16_0, tmp16_1; + int16x8_t tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37; + uint8x16_t dest0, dest1, dest2, dest3, dest4, dest5, dest6, dest7, dest8, + dest9, destA, destB, destC, destD, destE, destF; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + ROUND_SHIFT_INIT; + + // transform rows + // load and transform the upper half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), src00, src10, src01, + src11, src02, src12, src03, src13, src04, src14, src05, src15, + src06, src16, src07, src17); + 
TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, + tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); + TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp10, tmp11, + tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, src00, src01, src02, src03, + src04, src05, src06, src07, src10, src11, src12, src13, src14, src15, + src16, src17); + TRANSPOSE8x8(src00, src01, src02, src03, src04, src05, src06, src07, tmp00, + tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07); + TRANSPOSE8x8(src10, src11, src12, src13, src14, src15, src16, src17, tmp10, + tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17); + + // load and transform the lower half of 16x16 matrix + LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input), + 8 * sizeof(*input), src20, src30, src21, src31, src22, src32, + src23, src33, src24, src34, src25, src35, src26, src36, src27, + src37); + TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, + tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); + TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + IDCT16(tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, tmp30, tmp31, + tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src20, src21, src22, src23, + src24, src25, src26, src27, src30, src31, src32, src33, src34, src35, + src36, src37); + TRANSPOSE8x8(src20, src21, src22, src23, src24, src25, src26, src27, tmp20, + tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27); + TRANSPOSE8x8(src30, src31, src32, src33, src34, src35, src36, src37, tmp30, + tmp31, tmp32, tmp33, tmp34, tmp35, tmp36, tmp37); + + // transform columns + // left half first + IDCT16(tmp00, tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp20, tmp21, + tmp22, tmp23, tmp24, tmp25, tmp26, tmp27, src00, src01, src02, src03, + src04, src05, src06, src07, src20, src21, src22, src23, src24, src25, + src26, src27); + // right half + IDCT16(tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16, tmp17, tmp30, tmp31, + tmp32, tmp33, tmp34, tmp35, tmp36, tmp37, src10, src11, src12, src13, + src14, src15, src16, src17, src30, src31, src32, src33, src34, src35, + src36, src37); + + // load dest + LOAD_INPUT16(vec_vsx_ld, dest, 0, stride, dest0, dest1, dest2, dest3, dest4, + dest5, dest6, dest7, dest8, dest9, destA, destB, destC, destD, + destE, destF); + + PIXEL_ADD_STORE16(src00, src10, dest0, 0); + PIXEL_ADD_STORE16(src01, src11, dest1, stride); + PIXEL_ADD_STORE16(src02, src12, dest2, 2 * stride); + PIXEL_ADD_STORE16(src03, src13, dest3, 3 * stride); + PIXEL_ADD_STORE16(src04, src14, dest4, 4 * stride); + PIXEL_ADD_STORE16(src05, src15, dest5, 5 * stride); + PIXEL_ADD_STORE16(src06, src16, dest6, 6 * stride); + PIXEL_ADD_STORE16(src07, src17, dest7, 7 * stride); + + PIXEL_ADD_STORE16(src20, src30, dest8, 8 * stride); + PIXEL_ADD_STORE16(src21, src31, dest9, 9 * stride); + PIXEL_ADD_STORE16(src22, src32, destA, 10 * stride); + PIXEL_ADD_STORE16(src23, src33, destB, 11 * stride); + PIXEL_ADD_STORE16(src24, src34, destC, 12 * stride); + PIXEL_ADD_STORE16(src25, src35, destD, 13 * stride); + PIXEL_ADD_STORE16(src26, src36, destE, 14 * stride); + PIXEL_ADD_STORE16(src27, src37, destF, 15 * stride); +} + +#define LOAD_8x32(load, in00, in01, in02, in03, in10, in11, in12, in13, in20, \ + in21, in22, in23, in30, in31, in32, in33, in40, in41, in42, \ + in43, in50, in51, in52, in53, in60, in61, 
in62, in63, in70, \ + in71, in72, in73, offset) \ + /* load the first row from the 8x32 block*/ \ + in00 = load(offset, input); \ + in01 = load(offset + 16, input); \ + in02 = load(offset + 2 * 16, input); \ + in03 = load(offset + 3 * 16, input); \ + \ + in10 = load(offset + 4 * 16, input); \ + in11 = load(offset + 5 * 16, input); \ + in12 = load(offset + 6 * 16, input); \ + in13 = load(offset + 7 * 16, input); \ + \ + in20 = load(offset + 8 * 16, input); \ + in21 = load(offset + 9 * 16, input); \ + in22 = load(offset + 10 * 16, input); \ + in23 = load(offset + 11 * 16, input); \ + \ + in30 = load(offset + 12 * 16, input); \ + in31 = load(offset + 13 * 16, input); \ + in32 = load(offset + 14 * 16, input); \ + in33 = load(offset + 15 * 16, input); \ + \ + in40 = load(offset + 16 * 16, input); \ + in41 = load(offset + 17 * 16, input); \ + in42 = load(offset + 18 * 16, input); \ + in43 = load(offset + 19 * 16, input); \ + \ + in50 = load(offset + 20 * 16, input); \ + in51 = load(offset + 21 * 16, input); \ + in52 = load(offset + 22 * 16, input); \ + in53 = load(offset + 23 * 16, input); \ + \ + in60 = load(offset + 24 * 16, input); \ + in61 = load(offset + 25 * 16, input); \ + in62 = load(offset + 26 * 16, input); \ + in63 = load(offset + 27 * 16, input); \ + \ + /* load the last row from the 8x32 block*/ \ + in70 = load(offset + 28 * 16, input); \ + in71 = load(offset + 29 * 16, input); \ + in72 = load(offset + 30 * 16, input); \ + in73 = load(offset + 31 * 16, input); + +/* for the: temp1 = -step[x] * cospi_q + step[y] * cospi_z + * temp2 = step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_add(vec_mule(tmp16_0, cospi1), vec_mulo(tmp16_0, cospi0)); \ + temp11 = vec_add(vec_mule(tmp16_1, cospi1), vec_mulo(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +/* for the: temp1 = -step[x] * cospi_q - step[y] * cospi_z + * temp2 = -step[x] * cospi_z + step[y] * cospi_q */ +#define STEP32_1(inpt0, inpt1, outpt0, outpt1, cospi0, cospi1, cospi1m) \ + tmp16_0 = vec_mergeh(inpt0, inpt1); \ + tmp16_1 = vec_mergel(inpt0, inpt1); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi1m), vec_mule(tmp16_0, cospi0)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi1m), vec_mule(tmp16_1, cospi0)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt0 = vec_packs(temp10, temp11); \ + temp10 = vec_sub(vec_mulo(tmp16_0, cospi0), vec_mule(tmp16_0, cospi1)); \ + temp11 = vec_sub(vec_mulo(tmp16_1, cospi0), vec_mule(tmp16_1, cospi1)); \ + DCT_CONST_ROUND_SHIFT(temp10); \ + DCT_CONST_ROUND_SHIFT(temp11); \ + outpt1 = vec_packs(temp10, temp11); + +#define IDCT32(in0, in1, in2, in3, out) \ + \ + /* stage 1 */ \ + /* out[0][0] = in[0][0]; */ \ + out[0][1] = in2[0]; \ + out[0][2] = in1[0]; \ + out[0][3] = in3[0]; \ + out[0][4] = in0[4]; \ + out[0][5] = in2[4]; \ + out[0][6] = in1[4]; \ + out[0][7] = in3[4]; \ + out[1][0] = in0[2]; \ + out[1][1] = in2[2]; \ + out[1][2] = in1[2]; \ + out[1][3] = in3[2]; \ + out[1][4] = in0[6]; \ + out[1][5] = in2[6]; \ + out[1][6] = in1[6]; \ + out[1][7] = in3[6]; \ + \ + STEP8_0(in0[1], 
in3[7], out[2][0], out[3][7], cospi31_v, cospi1_v); \ + STEP8_0(in2[1], in1[7], out[2][1], out[3][6], cospi15_v, cospi17_v); \ + STEP8_0(in1[1], in2[7], out[2][2], out[3][5], cospi23_v, cospi9_v); \ + STEP8_0(in3[1], in0[7], out[2][3], out[3][4], cospi7_v, cospi25_v); \ + STEP8_0(in0[5], in3[3], out[2][4], out[3][3], cospi27_v, cospi5_v); \ + STEP8_0(in2[5], in1[3], out[2][5], out[3][2], cospi11_v, cospi21_v); \ + STEP8_0(in1[5], in2[3], out[2][6], out[3][1], cospi19_v, cospi13_v); \ + STEP8_0(in3[5], in0[3], out[2][7], out[3][0], cospi3_v, cospi29_v); \ + \ + /* stage 2 */ \ + /* in0[0] = out[0][0]; */ \ + in0[1] = out[0][1]; \ + in0[2] = out[0][2]; \ + in0[3] = out[0][3]; \ + in0[4] = out[0][4]; \ + in0[5] = out[0][5]; \ + in0[6] = out[0][6]; \ + in0[7] = out[0][7]; \ + \ + STEP8_0(out[1][0], out[1][7], in1[0], in1[7], cospi30_v, cospi2_v); \ + STEP8_0(out[1][1], out[1][6], in1[1], in1[6], cospi14_v, cospi18_v); \ + STEP8_0(out[1][2], out[1][5], in1[2], in1[5], cospi22_v, cospi10_v); \ + STEP8_0(out[1][3], out[1][4], in1[3], in1[4], cospi6_v, cospi26_v); \ + \ + in2[0] = vec_add(out[2][0], out[2][1]); \ + in2[1] = vec_sub(out[2][0], out[2][1]); \ + in2[2] = vec_sub(out[2][3], out[2][2]); \ + in2[3] = vec_add(out[2][3], out[2][2]); \ + in2[4] = vec_add(out[2][4], out[2][5]); \ + in2[5] = vec_sub(out[2][4], out[2][5]); \ + in2[6] = vec_sub(out[2][7], out[2][6]); \ + in2[7] = vec_add(out[2][7], out[2][6]); \ + in3[0] = vec_add(out[3][0], out[3][1]); \ + in3[1] = vec_sub(out[3][0], out[3][1]); \ + in3[2] = vec_sub(out[3][3], out[3][2]); \ + in3[3] = vec_add(out[3][3], out[3][2]); \ + in3[4] = vec_add(out[3][4], out[3][5]); \ + in3[5] = vec_sub(out[3][4], out[3][5]); \ + in3[6] = vec_sub(out[3][7], out[3][6]); \ + in3[7] = vec_add(out[3][6], out[3][7]); \ + \ + /* stage 3 */ \ + out[0][0] = in0[0]; \ + out[0][1] = in0[1]; \ + out[0][2] = in0[2]; \ + out[0][3] = in0[3]; \ + \ + STEP8_0(in0[4], in0[7], out[0][4], out[0][7], cospi28_v, cospi4_v); \ + STEP8_0(in0[5], in0[6], out[0][5], out[0][6], cospi12_v, cospi20_v); \ + \ + out[1][0] = vec_add(in1[0], in1[1]); \ + out[1][1] = vec_sub(in1[0], in1[1]); \ + out[1][2] = vec_sub(in1[3], in1[2]); \ + out[1][3] = vec_add(in1[2], in1[3]); \ + out[1][4] = vec_add(in1[4], in1[5]); \ + out[1][5] = vec_sub(in1[4], in1[5]); \ + out[1][6] = vec_sub(in1[7], in1[6]); \ + out[1][7] = vec_add(in1[6], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[3][7] = in3[7]; \ + STEP32(in2[1], in3[6], out[2][1], out[3][6], cospi4_v, cospi28_v); \ + STEP32_1(in2[2], in3[5], out[2][2], out[3][5], cospi28_v, cospi4_v, \ + cospi4m_v); \ + out[2][3] = in2[3]; \ + out[2][4] = in2[4]; \ + STEP32(in2[5], in3[2], out[2][5], out[3][2], cospi20_v, cospi12_v); \ + STEP32_1(in2[6], in3[1], out[2][6], out[3][1], cospi12_v, cospi20_v, \ + cospi20m_v); \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][3] = in3[3]; \ + out[3][4] = in3[4]; \ + \ + /* stage 4 */ \ + STEP16_1(out[0][0], out[0][1], in0[1], in0[0], cospi16_v); \ + STEP8_0(out[0][2], out[0][3], in0[2], in0[3], cospi24_v, cospi8_v); \ + in0[4] = vec_add(out[0][4], out[0][5]); \ + in0[5] = vec_sub(out[0][4], out[0][5]); \ + in0[6] = vec_sub(out[0][7], out[0][6]); \ + in0[7] = vec_add(out[0][7], out[0][6]); \ + \ + in1[0] = out[1][0]; \ + in1[7] = out[1][7]; \ + STEP32(out[1][1], out[1][6], in1[1], in1[6], cospi8_v, cospi24_v); \ + STEP32_1(out[1][2], out[1][5], in1[2], in1[5], cospi24_v, cospi8_v, \ + cospi8m_v); \ + in1[3] = out[1][3]; \ + in1[4] = out[1][4]; \ + \ + in2[0] = vec_add(out[2][0], out[2][3]); \ + in2[1] = 
vec_add(out[2][1], out[2][2]); \ + in2[2] = vec_sub(out[2][1], out[2][2]); \ + in2[3] = vec_sub(out[2][0], out[2][3]); \ + in2[4] = vec_sub(out[2][7], out[2][4]); \ + in2[5] = vec_sub(out[2][6], out[2][5]); \ + in2[6] = vec_add(out[2][5], out[2][6]); \ + in2[7] = vec_add(out[2][4], out[2][7]); \ + \ + in3[0] = vec_add(out[3][0], out[3][3]); \ + in3[1] = vec_add(out[3][1], out[3][2]); \ + in3[2] = vec_sub(out[3][1], out[3][2]); \ + in3[3] = vec_sub(out[3][0], out[3][3]); \ + in3[4] = vec_sub(out[3][7], out[3][4]); \ + in3[5] = vec_sub(out[3][6], out[3][5]); \ + in3[6] = vec_add(out[3][5], out[3][6]); \ + in3[7] = vec_add(out[3][4], out[3][7]); \ + \ + /* stage 5 */ \ + out[0][0] = vec_add(in0[0], in0[3]); \ + out[0][1] = vec_add(in0[1], in0[2]); \ + out[0][2] = vec_sub(in0[1], in0[2]); \ + out[0][3] = vec_sub(in0[0], in0[3]); \ + out[0][4] = in0[4]; \ + STEP16_1(in0[6], in0[5], out[0][5], out[0][6], cospi16_v); \ + out[0][7] = in0[7]; \ + \ + out[1][0] = vec_add(in1[0], in1[3]); \ + out[1][1] = vec_add(in1[1], in1[2]); \ + out[1][2] = vec_sub(in1[1], in1[2]); \ + out[1][3] = vec_sub(in1[0], in1[3]); \ + out[1][4] = vec_sub(in1[7], in1[4]); \ + out[1][5] = vec_sub(in1[6], in1[5]); \ + out[1][6] = vec_add(in1[5], in1[6]); \ + out[1][7] = vec_add(in1[4], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + STEP32(in2[2], in3[5], out[2][2], out[3][5], cospi8_v, cospi24_v); \ + STEP32(in2[3], in3[4], out[2][3], out[3][4], cospi8_v, cospi24_v); \ + STEP32_1(in2[4], in3[3], out[2][4], out[3][3], cospi24_v, cospi8_v, \ + cospi8m_v); \ + STEP32_1(in2[5], in3[2], out[2][5], out[3][2], cospi24_v, cospi8_v, \ + cospi8m_v); \ + out[2][6] = in2[6]; \ + out[2][7] = in2[7]; \ + out[3][0] = in3[0]; \ + out[3][1] = in3[1]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* stage 6 */ \ + in0[0] = vec_add(out[0][0], out[0][7]); \ + in0[1] = vec_add(out[0][1], out[0][6]); \ + in0[2] = vec_add(out[0][2], out[0][5]); \ + in0[3] = vec_add(out[0][3], out[0][4]); \ + in0[4] = vec_sub(out[0][3], out[0][4]); \ + in0[5] = vec_sub(out[0][2], out[0][5]); \ + in0[6] = vec_sub(out[0][1], out[0][6]); \ + in0[7] = vec_sub(out[0][0], out[0][7]); \ + in1[0] = out[1][0]; \ + in1[1] = out[1][1]; \ + STEP16_1(out[1][5], out[1][2], in1[2], in1[5], cospi16_v); \ + STEP16_1(out[1][4], out[1][3], in1[3], in1[4], cospi16_v); \ + in1[6] = out[1][6]; \ + in1[7] = out[1][7]; \ + \ + in2[0] = vec_add(out[2][0], out[2][7]); \ + in2[1] = vec_add(out[2][1], out[2][6]); \ + in2[2] = vec_add(out[2][2], out[2][5]); \ + in2[3] = vec_add(out[2][3], out[2][4]); \ + in2[4] = vec_sub(out[2][3], out[2][4]); \ + in2[5] = vec_sub(out[2][2], out[2][5]); \ + in2[6] = vec_sub(out[2][1], out[2][6]); \ + in2[7] = vec_sub(out[2][0], out[2][7]); \ + \ + in3[0] = vec_sub(out[3][7], out[3][0]); \ + in3[1] = vec_sub(out[3][6], out[3][1]); \ + in3[2] = vec_sub(out[3][5], out[3][2]); \ + in3[3] = vec_sub(out[3][4], out[3][3]); \ + in3[4] = vec_add(out[3][4], out[3][3]); \ + in3[5] = vec_add(out[3][5], out[3][2]); \ + in3[6] = vec_add(out[3][6], out[3][1]); \ + in3[7] = vec_add(out[3][7], out[3][0]); \ + \ + /* stage 7 */ \ + out[0][0] = vec_add(in0[0], in1[7]); \ + out[0][1] = vec_add(in0[1], in1[6]); \ + out[0][2] = vec_add(in0[2], in1[5]); \ + out[0][3] = vec_add(in0[3], in1[4]); \ + out[0][4] = vec_add(in0[4], in1[3]); \ + out[0][5] = vec_add(in0[5], in1[2]); \ + out[0][6] = vec_add(in0[6], in1[1]); \ + out[0][7] = vec_add(in0[7], in1[0]); \ + out[1][0] = vec_sub(in0[7], in1[0]); \ + out[1][1] = vec_sub(in0[6], in1[1]); \ + out[1][2] = 
vec_sub(in0[5], in1[2]); \ + out[1][3] = vec_sub(in0[4], in1[3]); \ + out[1][4] = vec_sub(in0[3], in1[4]); \ + out[1][5] = vec_sub(in0[2], in1[5]); \ + out[1][6] = vec_sub(in0[1], in1[6]); \ + out[1][7] = vec_sub(in0[0], in1[7]); \ + \ + out[2][0] = in2[0]; \ + out[2][1] = in2[1]; \ + out[2][2] = in2[2]; \ + out[2][3] = in2[3]; \ + STEP16_1(in3[3], in2[4], out[2][4], out[3][3], cospi16_v); \ + STEP16_1(in3[2], in2[5], out[2][5], out[3][2], cospi16_v); \ + STEP16_1(in3[1], in2[6], out[2][6], out[3][1], cospi16_v); \ + STEP16_1(in3[0], in2[7], out[2][7], out[3][0], cospi16_v); \ + out[3][4] = in3[4]; \ + out[3][5] = in3[5]; \ + out[3][6] = in3[6]; \ + out[3][7] = in3[7]; \ + \ + /* final */ \ + in0[0] = vec_add(out[0][0], out[3][7]); \ + in0[1] = vec_add(out[0][1], out[3][6]); \ + in0[2] = vec_add(out[0][2], out[3][5]); \ + in0[3] = vec_add(out[0][3], out[3][4]); \ + in0[4] = vec_add(out[0][4], out[3][3]); \ + in0[5] = vec_add(out[0][5], out[3][2]); \ + in0[6] = vec_add(out[0][6], out[3][1]); \ + in0[7] = vec_add(out[0][7], out[3][0]); \ + in1[0] = vec_add(out[1][0], out[2][7]); \ + in1[1] = vec_add(out[1][1], out[2][6]); \ + in1[2] = vec_add(out[1][2], out[2][5]); \ + in1[3] = vec_add(out[1][3], out[2][4]); \ + in1[4] = vec_add(out[1][4], out[2][3]); \ + in1[5] = vec_add(out[1][5], out[2][2]); \ + in1[6] = vec_add(out[1][6], out[2][1]); \ + in1[7] = vec_add(out[1][7], out[2][0]); \ + in2[0] = vec_sub(out[1][7], out[2][0]); \ + in2[1] = vec_sub(out[1][6], out[2][1]); \ + in2[2] = vec_sub(out[1][5], out[2][2]); \ + in2[3] = vec_sub(out[1][4], out[2][3]); \ + in2[4] = vec_sub(out[1][3], out[2][4]); \ + in2[5] = vec_sub(out[1][2], out[2][5]); \ + in2[6] = vec_sub(out[1][1], out[2][6]); \ + in2[7] = vec_sub(out[1][0], out[2][7]); \ + in3[0] = vec_sub(out[0][7], out[3][0]); \ + in3[1] = vec_sub(out[0][6], out[3][1]); \ + in3[2] = vec_sub(out[0][5], out[3][2]); \ + in3[3] = vec_sub(out[0][4], out[3][3]); \ + in3[4] = vec_sub(out[0][3], out[3][4]); \ + in3[5] = vec_sub(out[0][2], out[3][5]); \ + in3[6] = vec_sub(out[0][1], out[3][6]); \ + in3[7] = vec_sub(out[0][0], out[3][7]); + +// NOT A FULL TRANSPOSE! 
Transposes just each 8x8 block in each row, +// does not transpose rows +#define TRANSPOSE_8x32(in, out) \ + /* transpose 4 of 8x8 blocks */ \ + TRANSPOSE8x8(in[0][0], in[0][1], in[0][2], in[0][3], in[0][4], in[0][5], \ + in[0][6], in[0][7], out[0][0], out[0][1], out[0][2], out[0][3], \ + out[0][4], out[0][5], out[0][6], out[0][7]); \ + TRANSPOSE8x8(in[1][0], in[1][1], in[1][2], in[1][3], in[1][4], in[1][5], \ + in[1][6], in[1][7], out[1][0], out[1][1], out[1][2], out[1][3], \ + out[1][4], out[1][5], out[1][6], out[1][7]); \ + TRANSPOSE8x8(in[2][0], in[2][1], in[2][2], in[2][3], in[2][4], in[2][5], \ + in[2][6], in[2][7], out[2][0], out[2][1], out[2][2], out[2][3], \ + out[2][4], out[2][5], out[2][6], out[2][7]); \ + TRANSPOSE8x8(in[3][0], in[3][1], in[3][2], in[3][3], in[3][4], in[3][5], \ + in[3][6], in[3][7], out[3][0], out[3][1], out[3][2], out[3][3], \ + out[3][4], out[3][5], out[3][6], out[3][7]); + +#define PIXEL_ADD_STORE32(in0, in1, in2, in3, step) \ + dst = vec_vsx_ld((step)*stride, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in0, d_uh, add, shift6); \ + PIXEL_ADD(in1, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride, dest); \ + dst = vec_vsx_ld((step)*stride + 16, dest); \ + d_uh = (int16x8_t)vec_mergeh(dst, zerov); \ + d_ul = (int16x8_t)vec_mergel(dst, zerov); \ + PIXEL_ADD(in2, d_uh, add, shift6); \ + PIXEL_ADD(in3, d_ul, add, shift6); \ + vec_vsx_st(vec_packsu(d_uh, d_ul), (step)*stride + 16, dest); + +#define ADD_STORE_BLOCK(in, offset) \ + PIXEL_ADD_STORE32(in[0][0], in[1][0], in[2][0], in[3][0], offset + 0); \ + PIXEL_ADD_STORE32(in[0][1], in[1][1], in[2][1], in[3][1], offset + 1); \ + PIXEL_ADD_STORE32(in[0][2], in[1][2], in[2][2], in[3][2], offset + 2); \ + PIXEL_ADD_STORE32(in[0][3], in[1][3], in[2][3], in[3][3], offset + 3); \ + PIXEL_ADD_STORE32(in[0][4], in[1][4], in[2][4], in[3][4], offset + 4); \ + PIXEL_ADD_STORE32(in[0][5], in[1][5], in[2][5], in[3][5], offset + 5); \ + PIXEL_ADD_STORE32(in[0][6], in[1][6], in[2][6], in[3][6], offset + 6); \ + PIXEL_ADD_STORE32(in[0][7], in[1][7], in[2][7], in[3][7], offset + 7); + +void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t src0[4][8], src1[4][8], src2[4][8], src3[4][8], tmp[4][8]; + int16x8_t tmp16_0, tmp16_1; + int32x4_t temp10, temp11, temp20, temp21, temp30; + uint8x16_t dst; + int16x8_t d_uh, d_ul; + int16x8_t add = vec_sl(vec_splat_s16(8), vec_splat_u16(2)); + uint16x8_t shift6 = vec_splat_u16(6); + uint8x16_t zerov = vec_splat_u8(0); + + ROUND_SHIFT_INIT; + + LOAD_8x32(load_tran_low, src0[0][0], src0[1][0], src0[2][0], src0[3][0], + src0[0][1], src0[1][1], src0[2][1], src0[3][1], src0[0][2], + src0[1][2], src0[2][2], src0[3][2], src0[0][3], src0[1][3], + src0[2][3], src0[3][3], src0[0][4], src0[1][4], src0[2][4], + src0[3][4], src0[0][5], src0[1][5], src0[2][5], src0[3][5], + src0[0][6], src0[1][6], src0[2][6], src0[3][6], src0[0][7], + src0[1][7], src0[2][7], src0[3][7], 0); + // Rows + // transpose the first row of 8x8 blocks + TRANSPOSE_8x32(src0, tmp); + // transform the 32x8 column + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src0); + TRANSPOSE_8x32(tmp, src0); + + LOAD_8x32(load_tran_low, src1[0][0], src1[1][0], src1[2][0], src1[3][0], + src1[0][1], src1[1][1], src1[2][1], src1[3][1], src1[0][2], + src1[1][2], src1[2][2], src1[3][2], src1[0][3], src1[1][3], + src1[2][3], src1[3][3], src1[0][4], src1[1][4], src1[2][4], + src1[3][4], src1[0][5], src1[1][5], src1[2][5], 
src1[3][5], + src1[0][6], src1[1][6], src1[2][6], src1[3][6], src1[0][7], + src1[1][7], src1[2][7], src1[3][7], 512); + TRANSPOSE_8x32(src1, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src1); + TRANSPOSE_8x32(tmp, src1); + + LOAD_8x32(load_tran_low, src2[0][0], src2[1][0], src2[2][0], src2[3][0], + src2[0][1], src2[1][1], src2[2][1], src2[3][1], src2[0][2], + src2[1][2], src2[2][2], src2[3][2], src2[0][3], src2[1][3], + src2[2][3], src2[3][3], src2[0][4], src2[1][4], src2[2][4], + src2[3][4], src2[0][5], src2[1][5], src2[2][5], src2[3][5], + src2[0][6], src2[1][6], src2[2][6], src2[3][6], src2[0][7], + src2[1][7], src2[2][7], src2[3][7], 1024); + TRANSPOSE_8x32(src2, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src2); + TRANSPOSE_8x32(tmp, src2); + + LOAD_8x32(load_tran_low, src3[0][0], src3[1][0], src3[2][0], src3[3][0], + src3[0][1], src3[1][1], src3[2][1], src3[3][1], src3[0][2], + src3[1][2], src3[2][2], src3[3][2], src3[0][3], src3[1][3], + src3[2][3], src3[3][3], src3[0][4], src3[1][4], src3[2][4], + src3[3][4], src3[0][5], src3[1][5], src3[2][5], src3[3][5], + src3[0][6], src3[1][6], src3[2][6], src3[3][6], src3[0][7], + src3[1][7], src3[2][7], src3[3][7], 1536); + TRANSPOSE_8x32(src3, tmp); + IDCT32(tmp[0], tmp[1], tmp[2], tmp[3], src3); + TRANSPOSE_8x32(tmp, src3); + + // Columns + IDCT32(src0[0], src1[0], src2[0], src3[0], tmp); + IDCT32(src0[1], src1[1], src2[1], src3[1], tmp); + IDCT32(src0[2], src1[2], src2[2], src3[2], tmp); + IDCT32(src0[3], src1[3], src2[3], src3[3], tmp); + + ADD_STORE_BLOCK(src0, 0); + ADD_STORE_BLOCK(src1, 8); + ADD_STORE_BLOCK(src2, 16); + ADD_STORE_BLOCK(src3, 24); +} diff --git a/libvpx/vpx_dsp/ppc/sad_vsx.c b/libvpx/vpx_dsp/ppc/sad_vsx.c index 3edb40c31..bb49addae 100644 --- a/libvpx/vpx_dsp/ppc/sad_vsx.c +++ b/libvpx/vpx_dsp/ppc/sad_vsx.c @@ -10,9 +10,12 @@ #include <stdlib.h> +#include "./vpx_dsp_rtcd.h" + #include "vpx_dsp/ppc/types_vsx.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" #define PROCESS16(offset) \ v_a = vec_vsx_ld(offset, a); \ @@ -100,3 +103,152 @@ SAD32(32); SAD32(64); SAD64(32); SAD64(64); + +#define SAD16AVG(height) \ + unsigned int vpx_sad16x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(16, uint8_t, comp_pred[16 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 16, height, ref, \ + ref_stride); \ + \ + return vpx_sad16x##height##_vsx(src, src_stride, comp_pred, 16); \ + } + +#define SAD32AVG(height) \ + unsigned int vpx_sad32x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[32 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 32, height, ref, \ + ref_stride); \ + \ + return vpx_sad32x##height##_vsx(src, src_stride, comp_pred, 32); \ + } + +#define SAD64AVG(height) \ + unsigned int vpx_sad64x##height##_avg_vsx( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + DECLARE_ALIGNED(64, uint8_t, comp_pred[64 * height]); \ + vpx_comp_avg_pred_vsx(comp_pred, second_pred, 64, height, ref, \ + ref_stride); \ + return vpx_sad64x##height##_vsx(src, src_stride, comp_pred, 64); \ + } + +SAD16AVG(8); +SAD16AVG(16); +SAD16AVG(32); +SAD32AVG(16); +SAD32AVG(32); +SAD32AVG(64); +SAD64AVG(32); +SAD64AVG(64); + +#define PROCESS16_4D(offset, ref, v_h, v_l) \ + v_b = vec_vsx_ld(offset, ref); \ + v_bh = unpack_to_s16_h(v_b); \ 
+ v_bl = unpack_to_s16_l(v_b); \ + v_subh = vec_sub(v_h, v_bh); \ + v_subl = vec_sub(v_l, v_bl); \ + v_absh = vec_abs(v_subh); \ + v_absl = vec_abs(v_subl); \ + v_sad = vec_sum4s(v_absh, v_sad); \ + v_sad = vec_sum4s(v_absl, v_sad); + +#define UNPACK_SRC(offset, srcv_h, srcv_l) \ + v_a = vec_vsx_ld(offset, src); \ + srcv_h = unpack_to_s16_h(v_a); \ + srcv_l = unpack_to_s16_l(v_a); + +#define SAD16_4D(height) \ + void vpx_sad16x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah, v_al); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah, v_al); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD32_4D(height) \ + void vpx_sad32x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +#define SAD64_4D(height) \ + void vpx_sad64x##height##x4d_vsx(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[], \ + int ref_stride, uint32_t *sad_array) { \ + int i; \ + int y; \ + unsigned int sad[4]; \ + uint8x16_t v_a, v_b; \ + int16x8_t v_ah1, v_al1, v_ah2, v_al2, v_bh, v_bl; \ + int16x8_t v_ah3, v_al3, v_ah4, v_al4; \ + int16x8_t v_absh, v_absl, v_subh, v_subl; \ + \ + for (i = 0; i < 4; i++) sad_array[i] = 0; \ + \ + for (y = 0; y < height; y++) { \ + UNPACK_SRC(y *src_stride, v_ah1, v_al1); \ + UNPACK_SRC(y *src_stride + 16, v_ah2, v_al2); \ + UNPACK_SRC(y *src_stride + 32, v_ah3, v_al3); \ + UNPACK_SRC(y *src_stride + 48, v_ah4, v_al4); \ + for (i = 0; i < 4; i++) { \ + int32x4_t v_sad = vec_splat_s32(0); \ + PROCESS16_4D(y *ref_stride, ref_array[i], v_ah1, v_al1); \ + PROCESS16_4D(y *ref_stride + 16, ref_array[i], v_ah2, v_al2); \ + PROCESS16_4D(y *ref_stride + 32, ref_array[i], v_ah3, v_al3); \ + PROCESS16_4D(y *ref_stride + 48, ref_array[i], v_ah4, v_al4); \ + \ + vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + sad_array[i] += (sad[3] + sad[2] + sad[1] + sad[0]); \ + } \ + } \ + } + +SAD16_4D(8); +SAD16_4D(16); +SAD16_4D(32); +SAD32_4D(16); +SAD32_4D(32); +SAD32_4D(64); +SAD64_4D(32); +SAD64_4D(64); diff --git a/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c b/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c index 55dcdc2ba..5c3ba4576 100644 --- a/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c +++ b/libvpx/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -53,13 +53,13 @@ static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve_copy_vsx(const uint8_t *src, 
ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 16: { @@ -132,14 +132,8 @@ static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { switch (w) { case 16: { avg_w16(src, src_stride, dst, dst_stride, h); @@ -154,8 +148,8 @@ void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, break; } default: { - vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x, - filter_x_stride, filter_y, filter_y_stride, w, h); + vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -299,9 +293,9 @@ static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *const filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
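
A quick sanity check of the temp-buffer bound described in the comment above — a standalone sketch, not part of the patch, assuming the SUBPEL_BITS == 4 and SUBPEL_TAPS == 8 definitions from vpx_filter.h:

```c
/* Standalone sketch (not part of the patch): verifies the worst-case
 * intermediate height quoted in the convolve() comment above. */
#include <assert.h>

#define SUBPEL_BITS 4 /* assumed, per vpx_filter.h */
#define SUBPEL_TAPS 8 /* assumed, per vpx_filter.h */

static int intermediate_height(int h, int y0_q4, int y_step_q4) {
  /* Rows consumed by the vertical pass, plus the 8-tap filter tails. */
  return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
}

int main(void) {
  /* h = 64, y_step_q4 = 32 (x1/2 scaling), worst sub-pixel phase y0_q4 = 15:
   * ((63 * 32 + 15) >> 4) + 8 = 126 + 8 = 134, which fits in 64 * 135. */
  assert(intermediate_height(64, 15, 32) <= 135);
  return 0;
}
```
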
@@ -324,95 +318,77 @@ static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); } void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); } void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); } void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = 
get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } diff --git a/libvpx/vpx_dsp/quantize.c b/libvpx/vpx_dsp/quantize.c index 3c7f9832f..e37ca92ad 100644 --- a/libvpx/vpx_dsp/quantize.c +++ b/libvpx/vpx_dsp/quantize.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/quantize.h" #include "vpx_mem/vpx_mem.h" @@ -123,40 +125,40 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { - int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); - tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * - quant_shift_ptr[rc != 0]) >> - 16; // quantization - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - - if (tmp) eob = i; - } + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. 
+ for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + int tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> + 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) eob = i; } } *eob_ptr = eob + 1; @@ -174,39 +176,38 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = (int)n_coeffs - 1; i >= 0; i--) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) - non_zero_count--; - else - break; - } + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } - // Quantization pass: All coefficients with index >= zero_flag are - // skippable. Note: zero_flag can be zero. - for (i = 0; i < non_zero_count; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - - if (abs_coeff >= zbins[rc != 0]) { - const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; - if (abs_qcoeff) eob = i; - } + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + const int64_t tmp1 = abs_coeff + round_ptr[rc != 0]; + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 16); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + if (abs_qcoeff) eob = i; } } *eob_ptr = eob + 1; @@ -228,41 +229,40 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int idx_arr[1024]; int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. 
- if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - int tmp; - int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); - tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * - quant_shift_ptr[rc != 0]) >> - 15; + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } - qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int tmp; + int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + abs_coeff += ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + abs_coeff = clamp(abs_coeff, INT16_MIN, INT16_MAX); + tmp = ((((abs_coeff * quant_ptr[rc != 0]) >> 16) + abs_coeff) * + quant_shift_ptr[rc != 0]) >> + 15; - if (tmp) eob = idx_arr[i]; - } + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + + if (tmp) eob = idx_arr[i]; } *eob_ptr = eob + 1; } @@ -282,38 +282,35 @@ void vpx_highbd_quantize_b_32x32_c( int idx_arr[1024]; int i, eob = -1; (void)iscan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs; i++) { - const int rc = scan[i]; - const int coeff = coeff_ptr[rc]; - - // If the coefficient is out of the base ZBIN range, keep it for - // quantization. - if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) - idx_arr[idx++] = i; - } + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = scan[idx_arr[i]]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = idx_arr[i]; - } + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. 
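/* Editorial note, not part of the patch: the 32x32 paths differ from the
 * smaller block sizes in three compensating ways -- the rounding term is
 * halved via ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), the second-stage
 * shift is 15 rather than 16 (doubling the quantized magnitude), and
 * dequantization divides by 2 to match. Assuming the usual vpx_dsp macro,
 *   #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
 * the halved rounding term is simply (round + 1) >> 1. */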
+ for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign); + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = idx_arr[i]; } *eob_ptr = eob + 1; } diff --git a/libvpx/vpx_dsp/sad.c b/libvpx/vpx_dsp/sad.c index 6ceb37e43..18b6dc6e0 100644 --- a/libvpx/vpx_dsp/sad.c +++ b/libvpx/vpx_dsp/sad.c @@ -70,8 +70,6 @@ static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b, /* clang-format off */ // 64x64 sadMxN(64, 64) -sadMxNxK(64, 64, 3) -sadMxNxK(64, 64, 8) sadMxNx4D(64, 64) // 64x32 @@ -84,8 +82,6 @@ sadMxNx4D(32, 64) // 32x32 sadMxN(32, 32) -sadMxNxK(32, 32, 3) -sadMxNxK(32, 32, 8) sadMxNx4D(32, 32) // 32x16 @@ -122,12 +118,10 @@ sadMxNx4D(8, 8) // 8x4 sadMxN(8, 4) -sadMxNxK(8, 4, 8) sadMxNx4D(8, 4) // 4x8 sadMxN(4, 8) -sadMxNxK(4, 8, 8) sadMxNx4D(4, 8) // 4x4 @@ -183,17 +177,6 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, return highbd_sadb(src, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNxK(m, n, k) \ - void vpx_highbd_sad##m##x##n##x##k##_c( \ - const uint8_t *src, int src_stride, const uint8_t *ref_array, \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < k; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src, src_stride, \ - &ref_array[i], ref_stride); \ - } \ - } - #define highbd_sadMxNx4D(m, n) \ void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[], \ @@ -208,8 +191,6 @@ static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride, /* clang-format off */ // 64x64 highbd_sadMxN(64, 64) -highbd_sadMxNxK(64, 64, 3) -highbd_sadMxNxK(64, 64, 8) highbd_sadMxNx4D(64, 64) // 64x32 @@ -222,8 +203,6 @@ highbd_sadMxNx4D(32, 64) // 32x32 highbd_sadMxN(32, 32) -highbd_sadMxNxK(32, 32, 3) -highbd_sadMxNxK(32, 32, 8) highbd_sadMxNx4D(32, 32) // 32x16 @@ -236,42 +215,30 @@ highbd_sadMxNx4D(16, 32) // 16x16 highbd_sadMxN(16, 16) -highbd_sadMxNxK(16, 16, 3) -highbd_sadMxNxK(16, 16, 8) highbd_sadMxNx4D(16, 16) // 16x8 highbd_sadMxN(16, 8) -highbd_sadMxNxK(16, 8, 3) -highbd_sadMxNxK(16, 8, 8) highbd_sadMxNx4D(16, 8) // 8x16 highbd_sadMxN(8, 16) -highbd_sadMxNxK(8, 16, 3) -highbd_sadMxNxK(8, 16, 8) highbd_sadMxNx4D(8, 16) // 8x8 highbd_sadMxN(8, 8) -highbd_sadMxNxK(8, 8, 3) -highbd_sadMxNxK(8, 8, 8) highbd_sadMxNx4D(8, 8) // 8x4 highbd_sadMxN(8, 4) -highbd_sadMxNxK(8, 4, 8) highbd_sadMxNx4D(8, 4) // 4x8 highbd_sadMxN(4, 8) -highbd_sadMxNxK(4, 8, 8) highbd_sadMxNx4D(4, 8) // 4x4 highbd_sadMxN(4, 4) -highbd_sadMxNxK(4, 4, 3) -highbd_sadMxNxK(4, 4, 8) highbd_sadMxNx4D(4, 4) /* clang-format on */ diff --git a/libvpx/vpx_dsp/skin_detection.c b/libvpx/vpx_dsp/skin_detection.c new file mode 100644 index 000000000..bbbb6c3a1 --- /dev/null +++ b/libvpx/vpx_dsp/skin_detection.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" + +#define MODEL_MODE 1 + +// Fixed-point skin color model parameters. +static const int skin_mean[5][2] = { { 7463, 9614 }, + { 6400, 10240 }, + { 7040, 10240 }, + { 8320, 9280 }, + { 6800, 9614 } }; +static const int skin_inv_cov[4] = { 4107, 1663, 1663, 2157 }; // q16 +static const int skin_threshold[6] = { 1570636, 1400000, 800000, + 800000, 800000, 800000 }; // q18 +// Thresholds on luminance. +static const int y_low = 40; +static const int y_high = 220; + +// Evaluates the Mahalanobis distance measure for the input CbCr values. +static int vpx_evaluate_skin_color_difference(const int cb, const int cr, + const int idx) { + const int cb_q6 = cb << 6; + const int cr_q6 = cr << 6; + const int cb_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cb_q6 - skin_mean[idx][0]); + const int cbcr_diff_q12 = + (cb_q6 - skin_mean[idx][0]) * (cr_q6 - skin_mean[idx][1]); + const int cr_diff_q12 = + (cr_q6 - skin_mean[idx][1]) * (cr_q6 - skin_mean[idx][1]); + const int cb_diff_q2 = (cb_diff_q12 + (1 << 9)) >> 10; + const int cbcr_diff_q2 = (cbcr_diff_q12 + (1 << 9)) >> 10; + const int cr_diff_q2 = (cr_diff_q12 + (1 << 9)) >> 10; + const int skin_diff = + skin_inv_cov[0] * cb_diff_q2 + skin_inv_cov[1] * cbcr_diff_q2 + + skin_inv_cov[2] * cbcr_diff_q2 + skin_inv_cov[3] * cr_diff_q2; + return skin_diff; +} + +// Checks if the input yCbCr values corresponds to skin color. +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion) { + if (y < y_low || y > y_high) { + return 0; + } else if (MODEL_MODE == 0) { + return (vpx_evaluate_skin_color_difference(cb, cr, 0) < skin_threshold[0]); + } else { + int i = 0; + // Exit on grey. + if (cb == 128 && cr == 128) return 0; + // Exit on very strong cb. + if (cb > 150 && cr < 110) return 0; + for (; i < 5; ++i) { + int skin_color_diff = vpx_evaluate_skin_color_difference(cb, cr, i); + if (skin_color_diff < skin_threshold[i + 1]) { + if (y < 60 && skin_color_diff > 3 * (skin_threshold[i + 1] >> 2)) { + return 0; + } else if (motion == 0 && + skin_color_diff > (skin_threshold[i + 1] >> 1)) { + return 0; + } else { + return 1; + } + } + // Exit if difference is much large than the threshold. + if (skin_color_diff > (skin_threshold[i + 1] << 3)) { + return 0; + } + } + return 0; + } +} diff --git a/libvpx/vpx_dsp/skin_detection.h b/libvpx/vpx_dsp/skin_detection.h new file mode 100644 index 000000000..a2e99baf7 --- /dev/null +++ b/libvpx/vpx_dsp/skin_detection.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_SKIN_DETECTION_H_ +#define VPX_DSP_SKIN_DETECTION_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +int vpx_skin_pixel(const int y, const int cb, const int cr, int motion); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_DSP_SKIN_DETECTION_H_ diff --git a/libvpx/vpx_dsp/txfm_common.h b/libvpx/vpx_dsp/txfm_common.h index fd27f928e..d01d7085a 100644 --- a/libvpx/vpx_dsp/txfm_common.h +++ b/libvpx/vpx_dsp/txfm_common.h @@ -25,42 +25,42 @@ // printf("static const int cospi_%d_64 = %.0f;\n", i, // round(16384 * cos(i*M_PI/64))); // Note: sin(k*Pi/64) = cos((32-k)*Pi/64) -static const tran_high_t cospi_1_64 = 16364; -static const tran_high_t cospi_2_64 = 16305; -static const tran_high_t cospi_3_64 = 16207; -static const tran_high_t cospi_4_64 = 16069; -static const tran_high_t cospi_5_64 = 15893; -static const tran_high_t cospi_6_64 = 15679; -static const tran_high_t cospi_7_64 = 15426; -static const tran_high_t cospi_8_64 = 15137; -static const tran_high_t cospi_9_64 = 14811; -static const tran_high_t cospi_10_64 = 14449; -static const tran_high_t cospi_11_64 = 14053; -static const tran_high_t cospi_12_64 = 13623; -static const tran_high_t cospi_13_64 = 13160; -static const tran_high_t cospi_14_64 = 12665; -static const tran_high_t cospi_15_64 = 12140; -static const tran_high_t cospi_16_64 = 11585; -static const tran_high_t cospi_17_64 = 11003; -static const tran_high_t cospi_18_64 = 10394; -static const tran_high_t cospi_19_64 = 9760; -static const tran_high_t cospi_20_64 = 9102; -static const tran_high_t cospi_21_64 = 8423; -static const tran_high_t cospi_22_64 = 7723; -static const tran_high_t cospi_23_64 = 7005; -static const tran_high_t cospi_24_64 = 6270; -static const tran_high_t cospi_25_64 = 5520; -static const tran_high_t cospi_26_64 = 4756; -static const tran_high_t cospi_27_64 = 3981; -static const tran_high_t cospi_28_64 = 3196; -static const tran_high_t cospi_29_64 = 2404; -static const tran_high_t cospi_30_64 = 1606; -static const tran_high_t cospi_31_64 = 804; +static const tran_coef_t cospi_1_64 = 16364; +static const tran_coef_t cospi_2_64 = 16305; +static const tran_coef_t cospi_3_64 = 16207; +static const tran_coef_t cospi_4_64 = 16069; +static const tran_coef_t cospi_5_64 = 15893; +static const tran_coef_t cospi_6_64 = 15679; +static const tran_coef_t cospi_7_64 = 15426; +static const tran_coef_t cospi_8_64 = 15137; +static const tran_coef_t cospi_9_64 = 14811; +static const tran_coef_t cospi_10_64 = 14449; +static const tran_coef_t cospi_11_64 = 14053; +static const tran_coef_t cospi_12_64 = 13623; +static const tran_coef_t cospi_13_64 = 13160; +static const tran_coef_t cospi_14_64 = 12665; +static const tran_coef_t cospi_15_64 = 12140; +static const tran_coef_t cospi_16_64 = 11585; +static const tran_coef_t cospi_17_64 = 11003; +static const tran_coef_t cospi_18_64 = 10394; +static const tran_coef_t cospi_19_64 = 9760; +static const tran_coef_t cospi_20_64 = 9102; +static const tran_coef_t cospi_21_64 = 8423; +static const tran_coef_t cospi_22_64 = 7723; +static const tran_coef_t cospi_23_64 = 7005; +static const tran_coef_t cospi_24_64 = 6270; +static const tran_coef_t cospi_25_64 = 5520; +static const tran_coef_t cospi_26_64 = 4756; +static const tran_coef_t cospi_27_64 = 3981; +static const tran_coef_t cospi_28_64 = 3196; +static const tran_coef_t cospi_29_64 = 2404; +static const tran_coef_t cospi_30_64 = 1606; +static const tran_coef_t cospi_31_64 = 804; // 16384 * sqrt(2) * sin(kPi/9) * 2 / 3 -static const tran_high_t 
sinpi_1_9 = 5283; -static const tran_high_t sinpi_2_9 = 9929; -static const tran_high_t sinpi_3_9 = 13377; -static const tran_high_t sinpi_4_9 = 15212; +static const tran_coef_t sinpi_1_9 = 5283; +static const tran_coef_t sinpi_2_9 = 9929; +static const tran_coef_t sinpi_3_9 = 13377; +static const tran_coef_t sinpi_4_9 = 15212; #endif // VPX_DSP_TXFM_COMMON_H_ diff --git a/libvpx/vpx_dsp/variance.c b/libvpx/vpx_dsp/variance.c index b1744047a..93bd8f30d 100644 --- a/libvpx/vpx_dsp/variance.c +++ b/libvpx/vpx_dsp/variance.c @@ -8,8 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <assert.h> - #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" @@ -166,7 +164,7 @@ static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b, var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ + vpx_comp_avg_pred_c(temp3, second_pred, W, H, temp2, W); \ \ return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \ } @@ -226,9 +224,6 @@ MSE(8, 8) void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { int i, j; - /* comp_pred and pred must be 16 byte aligned. */ - assert(((intptr_t)comp_pred & 0xf) == 0); - assert(((intptr_t)pred & 0xf) == 0); for (i = 0; i < height; ++i) { for (j = 0; j < width; ++j) { @@ -468,8 +463,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ @@ -488,8 +483,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ @@ -508,8 +503,8 @@ static void highbd_var_filter_block2d_bil_second_pass( highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ bilinear_filters[yoffset]); \ \ - vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \ - CONVERT_TO_BYTEPTR(temp2), W); \ + vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \ + CONVERT_TO_BYTEPTR(temp2), W); \ \ return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \ dst, dst_stride, sse); \ diff --git a/libvpx/vpx_dsp/variance.h b/libvpx/vpx_dsp/variance.h index 4c482551e..100573299 100644 --- a/libvpx/vpx_dsp/variance.h +++ b/libvpx/vpx_dsp/variance.h @@ -74,8 +74,6 @@ typedef struct vp9_variance_vtable { vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; - vpx_sad_multi_fn_t sdx3f; - vpx_sad_multi_fn_t sdx8f; vpx_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/libvpx/vpx_dsp/vpx_convolve.c b/libvpx/vpx_dsp/vpx_convolve.c index 02c5a955a..e55a963f9 100644 --- a/libvpx/vpx_dsp/vpx_convolve.c +++ b/libvpx/vpx_dsp/vpx_convolve.c @@ -113,135 +113,107 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } -static void 
convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h) { - // Note: Fixed size intermediate buffer, temp, places limits on parameters. - // 2d filtering proceeds in 2 steps: - // (1) Interpolate horizontally into an intermediate buffer, temp. - // (2) Interpolate temp vertically to derive the sub-pixel result. - // Deriving the maximum number of rows in the temp buffer (135): - // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). - // --Largest block size is 64x64 pixels. - // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the - // original frame (in 1/16th pixel units). - // --Must round-up because block may be located at sub-pixel position. - // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. - // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[64 * 135]; - const int intermediate_height = - (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; - - assert(w <= 64); - assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); - - convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); -} - void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)y0_q4; (void)y_step_q4; - - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); } void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); } void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)x0_q4; (void)x_step_q4; - - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t 
dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); } void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + // Note: Fixed size intermediate buffer, temp, places limits on parameters. + // 2d filtering proceeds in 2 steps: + // (1) Interpolate horizontally into an intermediate buffer, temp. + // (2) Interpolate temp vertically to derive the sub-pixel result. + // Deriving the maximum number of rows in the temp buffer (135): + // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). + // --Largest block size is 64x64 pixels. + // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the + // original frame (in 1/16th pixel units). + // --Must round-up because block may be located at sub-pixel position. + // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. + // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. + uint8_t temp[64 * 135]; + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + assert(w <= 64); + assert(h <= 64); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); + + convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer places limits on parameters. 
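/* Editorial note, not part of the patch: the _avg variant is filter-then-
 * blend -- vpx_convolve8_c() writes into the 64x64 scratch buffer declared
 * below, and vpx_convolve_avg_c() then rounds each pixel into the
 * destination as dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1), i.e.
 * (dst + src + 1) >> 1, exactly as in the avg loop shown further down. */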
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (r = h; r > 0; --r) { memcpy(dst, src, w); @@ -251,15 +223,16 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); @@ -269,53 +242,52 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t 
*filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } #if CONFIG_VP9_HIGHBITDEPTH @@ -417,9 +389,9 @@ static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
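
The thread running through all of these hunks is the signature migration: every convolve entry point now takes a single InterpKernel table plus explicit x0_q4/y0_q4 phase offsets, hoisting the get_filter_base()/get_filter_offset() lookups out of each kernel and into the caller. A minimal caller-side sketch, assuming both sub-pixel filters come from the same kernel table (as the VP9 filters do) and that the two removed helpers keep the semantics shown in the deleted lines above; convolve8_compat is a hypothetical name, not a libvpx API:

```c
#include <stddef.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/vpx_filter.h"

/* Hedged sketch of bridging the old argument convention to the new one. */
static void convolve8_compat(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Resolve the kernel table and Q4 phase offsets once, up front, exactly
   * as the removed per-function prologues used to. */
  const InterpKernel *const filter = get_filter_base(filter_x);
  const int x0_q4 = get_filter_offset(filter_x, filter);
  const int y0_q4 = get_filter_offset(filter_y, filter);

  vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
                  y0_q4, y_step_q4, w, h);
}
```
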
@@ -442,113 +414,97 @@ static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - temp, 64, x_filters, x0_q4, x_step_q4, w, + temp, 64, filter, x0_q4, x_step_q4, w, intermediate_height, bd); highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h, bd); + filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; (void)y_step_q4; - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = 
get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h, bd); + highbd_convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, - filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h, + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h, bd); } void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (r = h; r > 0; --r) { @@ -560,15 +516,16 @@ void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (y = 0; y < h; ++y) { diff --git a/libvpx/vpx_dsp/vpx_convolve.h b/libvpx/vpx_dsp/vpx_convolve.h index 1aedd32bd..7979268a9 100644 --- a/libvpx/vpx_dsp/vpx_convolve.h +++ b/libvpx/vpx_dsp/vpx_convolve.h @@ -19,15 +19,15 @@ extern "C" { typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #if CONFIG_VP9_HIGHBITDEPTH typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); #endif diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk index 6ac7182ab..3b1a873cd 100644 --- a/libvpx/vpx_dsp/vpx_dsp.mk +++ b/libvpx/vpx_dsp/vpx_dsp.mk @@ -50,12 +50,13 @@ DSP_SRCS-yes += intrapred.c DSP_SRCS-$(HAVE_SSE) += 
x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm -DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm DSP_SRCS-$(HAVE_VSX) += ppc/intrapred_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH @@ -87,6 +88,8 @@ DSP_SRCS-yes += vpx_filter.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/convolve.h DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/vpx_asm_stubs.c +DSP_SRCS-$(HAVE_SSSE3) += x86/convolve_ssse3.h +DSP_SRCS-$(HAVE_AVX2) += x86/convolve_avx2.h DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_subpixel_bilinear_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm @@ -104,6 +107,7 @@ DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/vpx_scaled_convolve8_neon.c ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/vpx_convolve_copy_neon_asm$(ASM) @@ -194,6 +198,9 @@ endif DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c @@ -207,10 +214,13 @@ DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) +DSP_SRCS-$(HAVE_VSX) += ppc/inv_txfm_vsx.c + ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c @@ -237,6 +247,11 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct4x4_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct8x8_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct16x16_add_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_idct32x32_add_sse2.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_inv_txfm_sse4.h +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct4x4_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct8x8_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct16x16_add_sse4.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_idct32x32_add_sse4.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) @@ -264,18 +279,19 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h +DSP_SRCS-$(HAVE_SSE2) += x86/quantize_x86.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c +DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c +DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c +DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3_x86_64.asm -DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx_x86_64.asm -endif # avg DSP_SRCS-yes += avg.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += 
arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c @@ -286,6 +302,10 @@ DSP_SRCS-$(HAVE_VSX) += ppc/hadamard_vsx.c endif # CONFIG_VP9_ENCODER +# skin detection +DSP_SRCS-yes += skin_detection.h +DSP_SRCS-yes += skin_detection.c + ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c @@ -300,11 +320,15 @@ DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c DSP_SRCS-$(HAVE_MSA) += mips/subtract_msa.c +DSP_SRCS-$(HAVE_MMI) += mips/sad_mmi.c +DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c + DSP_SRCS-$(HAVE_SSE3) += x86/sad_sse3.asm DSP_SRCS-$(HAVE_SSSE3) += x86/sad_ssse3.asm DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c +DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c DSP_SRCS-$(HAVE_SSE) += x86/sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE) += x86/sad_sse2.asm @@ -325,17 +349,19 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC) DSP_SRCS-yes += variance.c DSP_SRCS-yes += variance.h +DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c +DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c + DSP_SRCS-$(HAVE_SSE) += x86/variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c -DSP_SRCS-$(HAVE_AVX2) += x86/variance_impl_avx2.c DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c ifeq ($(ARCH_X86_64),yes) @@ -354,7 +380,9 @@ endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC # Neon utilities DSP_SRCS-$(HAVE_NEON) += arm/mem_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/sum_neon.h DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/vpx_convolve8_neon.h # PPC VSX utilities DSP_SRCS-$(HAVE_VSX) += ppc/types_vsx.h @@ -362,6 +390,7 @@ DSP_SRCS-$(HAVE_VSX) += ppc/transpose_vsx.h DSP_SRCS-$(HAVE_VSX) += ppc/bitdepth_conversion_vsx.h # X86 utilities +DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) diff --git a/libvpx/vpx_dsp/vpx_dsp_common.h b/libvpx/vpx_dsp/vpx_dsp_common.h index 49d36e545..c8c852374 100644 --- a/libvpx/vpx_dsp/vpx_dsp_common.h +++ b/libvpx/vpx_dsp/vpx_dsp_common.h @@ -43,6 +43,8 @@ typedef int32_t tran_high_t; typedef int16_t tran_low_t; #endif // CONFIG_VP9_HIGHBITDEPTH +typedef int16_t tran_coef_t; + static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } @@ -55,7 +57,6 @@ static INLINE double fclamp(double value, double low, double high) { return value < low ? low : (value > high ? high : value); } -#if CONFIG_VP9_HIGHBITDEPTH static INLINE uint16_t clip_pixel_highbd(int val, int bd) { switch (bd) { case 8: @@ -64,7 +65,6 @@ static INLINE uint16_t clip_pixel_highbd(int val, int bd) { case 12: return (uint16_t)clamp(val, 0, 4095); } } -#endif // CONFIG_VP9_HIGHBITDEPTH #ifdef __cplusplus } // extern "C" diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index c67483641..1a743d910 100644 --- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vpx_dsp_forward_decls() { print <<EOF /* @@ -6,6 +16,7 @@ print <<EOF #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" EOF } @@ -19,6 +30,7 @@ if ($opts{arch} eq "x86_64") { $ssse3_x86_64 = 'ssse3'; $avx_x86_64 = 'avx'; $avx2_x86_64 = 'avx2'; + $avx512_x86_64 = 'avx512'; } # @@ -188,21 +200,25 @@ specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/; # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d45_predictor_4x4 neon/; + specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_4x4 neon/; + specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d117_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d135_predictor_4x4 neon/; + specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d153_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; @@ -214,30 +230,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/; + specialize qw/vpx_highbd_dc_top_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/; + specialize qw/vpx_highbd_dc_left_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/; + specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const 
uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d45_predictor_8x8 neon/; + specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_8x8 neon/; + specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d135_predictor_8x8 neon/; + specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; @@ -249,30 +269,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/; + specialize qw/vpx_highbd_dc_top_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/; + specialize qw/vpx_highbd_dc_left_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/; + specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d45_predictor_16x16 neon/; + specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_16x16 neon/; + specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize 
qw/vpx_highbd_d117_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d135_predictor_16x16 neon/; + specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; @@ -284,30 +308,34 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/; + specialize qw/vpx_highbd_dc_top_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/; + specialize qw/vpx_highbd_dc_left_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/; + specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d45_predictor_32x32 neon/; + specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_h_predictor_32x32 neon/; + specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d135_predictor_32x32 neon/; + specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; @@ -319,81 +347,81 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, 
"uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/; + specialize qw/vpx_highbd_dc_top_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/; + specialize qw/vpx_highbd_dc_left_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/; + specialize qw/vpx_highbd_dc_128_predictor_32x32 neon sse2/; } # CONFIG_VP9_HIGHBITDEPTH # # Sub Pixel Filters # -add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/; -add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa vsx/; +add_proto qw/void vpx_convolve8_avg/, 
"const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa vsx/; +add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa vsx/; +add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx/; -add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; -specialize qw/vpx_scaled_2d ssse3/; +add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; +specialize qw/vpx_scaled_2d ssse3 neon msa/; -add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int 
x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; +add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Sub Pixel Filters # - add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve_copy sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve_avg sse2 avx2 neon/; - add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8 avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_vert avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t 
dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg_horiz avx2 neon/, "$sse2_x86_64"; - add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; + add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bps"; specialize qw/vpx_highbd_convolve8_avg_vert avx2 neon/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH @@ -487,28 +515,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct4x4 neon sse2/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct4x4_1 sse2/; + specialize qw/vpx_fdct4x4_1 sse2 neon/; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8 neon sse2/; add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct8x8_1 neon sse2/; + specialize qw/vpx_fdct8x8_1 neon sse2 msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 sse2/; + specialize qw/vpx_fdct16x16 neon sse2/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16_1 sse2/; + specialize qw/vpx_fdct16x16_1 sse2 neon/; add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32 sse2/; + specialize qw/vpx_fdct32x32 neon sse2/; add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_rd sse2/; + specialize qw/vpx_fdct32x32_rd neon sse2/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_1 sse2/; + specialize qw/vpx_fdct32x32_1 sse2 neon/; add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct4x4 sse2/; @@ -517,6 +545,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_fdct8x8 sse2/; add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; + specialize qw/vpx_highbd_fdct8x8_1 neon/; + $vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon; add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_highbd_fdct16x16 sse2/; @@ -535,7 +565,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct4x4 neon sse2 msa/; add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride"; - 
specialize qw/vpx_fdct4x4_1 sse2/; + specialize qw/vpx_fdct4x4_1 sse2 neon/; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64"; @@ -544,19 +574,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 sse2 msa/; + specialize qw/vpx_fdct16x16 neon sse2 msa/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16_1 sse2 msa/; + specialize qw/vpx_fdct16x16_1 sse2 neon msa/; add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32 sse2 avx2 msa/; + specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/; add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_rd sse2 avx2 msa/; + specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa/; add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct32x32_1 sse2 msa/; + specialize qw/vpx_fdct32x32_1 sse2 neon msa/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER @@ -581,25 +611,24 @@ add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH is off. - specialize qw/vpx_idct4x4_16_add neon sse2/; + # Note that there are more specializations appended when + # CONFIG_VP9_HIGHBITDEPTH is off. + specialize qw/vpx_idct4x4_16_add neon sse2 vsx/; specialize qw/vpx_idct4x4_1_add neon sse2/; - specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/; + specialize qw/vpx_idct8x8_64_add neon sse2 vsx/; specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; specialize qw/vpx_idct8x8_1_add neon sse2/; - specialize qw/vpx_idct16x16_256_add neon sse2/; + specialize qw/vpx_idct16x16_256_add neon sse2 vsx/; specialize qw/vpx_idct16x16_38_add neon sse2/; - $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2; specialize qw/vpx_idct16x16_10_add neon sse2/; specialize qw/vpx_idct16x16_1_add neon sse2/; - specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_1024_add neon sse2 vsx/; specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; - $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { - # Note that these specializations appends to the above ones. + # Note that these specializations are appended to the above ones. 
specialize qw/vpx_idct4x4_16_add dspr2 msa/; specialize qw/vpx_idct4x4_1_add dspr2 msa/; specialize qw/vpx_idct8x8_64_add dspr2 msa/; @@ -652,16 +681,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { - specialize qw/vpx_highbd_idct4x4_16_add neon sse2/; - specialize qw/vpx_highbd_idct8x8_64_add neon sse2/; - specialize qw/vpx_highbd_idct8x8_12_add neon sse2/; - specialize qw/vpx_highbd_idct16x16_256_add neon sse2/; - specialize qw/vpx_highbd_idct16x16_38_add neon sse2/; - $vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2; - specialize qw/vpx_highbd_idct16x16_10_add neon sse2/; - specialize qw/vpx_highbd_idct32x32_1024_add neon/; - specialize qw/vpx_highbd_idct32x32_135_add neon/; - specialize qw/vpx_highbd_idct32x32_34_add neon/; + specialize qw/vpx_highbd_idct4x4_16_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct8x8_64_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct8x8_12_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct16x16_256_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct16x16_38_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct16x16_10_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct32x32_1024_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct32x32_135_add neon sse2 sse4_1/; + specialize qw/vpx_highbd_idct32x32_34_add neon sse2 sse4_1/; } # !CONFIG_EMULATE_HARDWARE } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9 @@ -671,10 +699,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64"; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64"; + specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; @@ -690,49 +718,49 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") { # Block subtraction # add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vpx_subtract_block neon msa sse2/; +specialize qw/vpx_subtract_block neon msa mmi sse2/; # # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 avx2 neon msa sse2 vsx/; +specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x32 avx2 msa sse2 vsx/; +specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x64 avx2 msa sse2 vsx/; +specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 avx2 neon msa sse2 vsx/; +specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x16 avx2 msa sse2 vsx/; +specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x32 msa sse2 vsx/; +specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2 vsx/; +specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x8 neon msa sse2 vsx/; +specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x16 neon msa sse2/; +specialize qw/vpx_sad8x16 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x8 neon msa sse2/; +specialize qw/vpx_sad8x8 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x4 msa sse2/; +specialize qw/vpx_sad8x4 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad4x8 msa sse2/; +specialize qw/vpx_sad4x8 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad4x4 neon msa sse2/; +specialize qw/vpx_sad4x4 neon msa sse2 mmi/; # # Avg @@ -748,23 +776,23 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_minmax_8x8 sse2 neon msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; + add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64"; - add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; - specialize qw/vpx_hadamard_16x16 sse2 neon vsx/; + add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; + specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/; add_proto qw/int 
vpx_satd/, "const tran_low_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon/; + specialize qw/vpx_satd avx2 sse2 neon/; } else { - add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; + add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64"; - add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_16x16 sse2 neon msa vsx/; + add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; + specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/; add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon msa/; + specialize qw/vpx_satd avx2 sse2 neon msa/; } add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; @@ -778,138 +806,120 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x64_avg avx2 msa sse2/; +specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x32_avg avx2 msa sse2/; +specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x64_avg avx2 msa sse2/; +specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x32_avg avx2 msa sse2/; +specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x16_avg avx2 msa sse2/; +specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x32_avg msa sse2/; +specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x16_avg msa sse2/; +specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x8_avg msa sse2/; +specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad8x16_avg msa sse2/; +specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred"; -specialize qw/vpx_sad8x8_avg msa sse2/; +specialize qw/vpx_sad8x8_avg neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad8x4_avg msa sse2/; +specialize qw/vpx_sad8x4_avg neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad4x8_avg msa sse2/; +specialize qw/vpx_sad4x8_avg neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad4x4_avg msa sse2/; +specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/; # # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally # # Blocks of 3 -add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad64x64x3 msa/; - -add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x32x3 msa/; - add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/; +specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/; add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/; +specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/; add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x16x3 sse3 msa/; +specialize qw/vpx_sad8x16x3 sse3 msa mmi/; add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x8x3 sse3 msa/; +specialize qw/vpx_sad8x8x3 sse3 msa mmi/; add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x4x3 sse3 msa/; +specialize qw/vpx_sad4x4x3 sse3 msa mmi/; # Blocks of 8 -add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad64x64x8 msa/; - -add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x32x8 msa/; - add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x16x8 sse4_1 msa/; +specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/; add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x8x8 sse4_1 msa/; +specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/; add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x16x8 sse4_1 msa/; +specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/; add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x8x8 sse4_1 msa/; - -add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x4x8 msa/; - -add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x8x8 msa/; +specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/; add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x4x8 sse4_1 msa/; +specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad64x64x4d avx2 neon msa sse2/; +specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad64x32x4d msa sse2/; +specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x64x4d msa sse2/; +specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x32x4d avx2 neon msa sse2/; +specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad32x16x4d msa sse2/; +specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x32x4d msa sse2/; +specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x16x4d neon msa sse2/; +specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad16x8x4d msa sse2/; +specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x16x4d msa sse2/; +specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x8x4d msa sse2/; +specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad8x4x4d msa sse2/; +specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, 
const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x8x4d msa sse2/; +specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array"; -specialize qw/vpx_sad4x4x4d msa sse2/; +specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 sse2 msa/; @@ -1016,43 +1026,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; # - # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally - # - # Blocks of 3 - add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - # Blocks of 8 - add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; - - # # Multi-block SAD, comparing a reference to N independent blocks # add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array"; @@ -1109,43 +1082,43 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || 
vpx_config("CONFIG_POSTPROC") eq " # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa/; + specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa/; + specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 neon msa/; + specialize qw/vpx_variance32x64 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa/; + specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa/; + specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 neon msa/; + specialize qw/vpx_variance16x32 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa/; + specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 neon msa/; + specialize qw/vpx_variance16x8 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa/; + specialize qw/vpx_variance8x16 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa/; + specialize qw/vpx_variance8x8 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 neon msa/; + specialize qw/vpx_variance8x4 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x8 sse2 neon msa/; + specialize qw/vpx_variance4x8 sse2 neon msa mmi/; add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x4 sse2 neon msa/; + specialize qw/vpx_variance4x4 sse2 neon msa mmi/; # # Specialty Variance @@ -1157,16 +1130,16 @@ add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, co specialize qw/vpx_get8x8var sse2 neon msa/; add_proto 
qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa/; + specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 msa/; + specialize qw/vpx_mse16x8 sse2 msa mmi/; add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 msa/; + specialize qw/vpx_mse8x16 sse2 msa mmi/; add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 msa/; + specialize qw/vpx_mse8x8 sse2 msa mmi/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; @@ -1175,88 +1148,88 @@ add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int specialize qw/vpx_get4x4sse_cs neon msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - specialize qw/vpx_comp_avg_pred sse2 vsx/; + specialize qw/vpx_comp_avg_pred neon sse2 vsx/; # # Subpixel Variance # add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance64x32 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance64x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x64 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance32x16 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x32 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize 
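vpx_comp_avg_pred, which picks up a neon specialization just below, averages two predictors with round-to-nearest; the signature matches the prototype in this hunk. A minimal sketch of that semantics (comp_avg_pred_ref is a hypothetical name):

#include <stdint.h>

/* Compound prediction: each output pixel is the rounded mean of the
 * two predictors. pred is packed at width; ref has its own stride. */
static void comp_avg_pred_ref(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height, const uint8_t *ref,
                              int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}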
qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x8 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x16 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x4 neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance4x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance4x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance4x4 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance64x32 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance32x64 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance32x64 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance32x32 neon avx2 msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance32x16 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance32x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance16x32 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance16x32 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance16x16 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance16x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance16x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance16x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance8x16 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance8x16 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance8x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance8x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance8x4 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance8x4 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance4x8 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance4x8 neon msa mmi sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred"; - specialize qw/vpx_sub_pixel_avg_variance4x4 msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_avg_variance4x4 neon msa mmi sse2 ssse3/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; diff --git a/libvpx/vpx_dsp/vpx_filter.h b/libvpx/vpx_dsp/vpx_filter.h index 26d690501..6cea251bc 100644 --- a/libvpx/vpx_dsp/vpx_filter.h +++ b/libvpx/vpx_dsp/vpx_filter.h @@ -26,17 +26,6 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; -static INLINE const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. 
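The get_filter_base/get_filter_offset helpers removed in the vpx_filter.h hunk here recovered the kernel-table base by masking the low 8 address bits, which is only valid while the 16-kernel x 8-tap x 2-byte table stays 256-byte aligned (hence the NOTE above). The reworked convolve API elsewhere in this change passes the InterpKernel table plus x0_q4/y0_q4 offsets directly, so the trick becomes unnecessary. For reference, the masking arithmetic, renamed to avoid clashing with the removed symbols:

#include <assert.h>
#include <stdint.h>

typedef int16_t InterpKernel[8]; /* SUBPEL_TAPS == 8 */

/* 16 kernels * 8 taps * 2 bytes = 256 bytes, so clearing the low 8
 * address bits of any kernel pointer recovers the table base. */
static const InterpKernel *filter_base(const int16_t *filter) {
  return (const InterpKernel *)(((intptr_t)filter) & ~(intptr_t)0xFF);
}

static int filter_offset(const int16_t *f, const InterpKernel *base) {
  const int o = (int)((const InterpKernel *)(intptr_t)f - base);
  assert(o >= 0 && o < 16);
  return o;
}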
- // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static INLINE int get_filter_offset(const int16_t *f, - const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - #ifdef __cplusplus } // extern "C" #endif diff --git a/libvpx/vpx_dsp/x86/add_noise_sse2.asm b/libvpx/vpx_dsp/x86/add_noise_sse2.asm index f758da22d..80cced4ce 100644 --- a/libvpx/vpx_dsp/x86/add_noise_sse2.asm +++ b/libvpx/vpx_dsp/x86/add_noise_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;void vpx_plane_add_noise_sse2(uint8_t *start, const int8_t *noise, ; int blackclamp, int whiteclamp, ; int width, int height, int pitch) @@ -26,13 +28,13 @@ sym(vpx_plane_add_noise_sse2): mov rdx, 0x01010101 mov rax, arg(2) mul rdx - movd xmm3, rax + movq xmm3, rax pshufd xmm3, xmm3, 0 ; xmm3 is 16 copies of char in blackclamp mov rdx, 0x01010101 mov rax, arg(3) mul rdx - movd xmm4, rax + movq xmm4, rax pshufd xmm4, xmm4, 0 ; xmm4 is 16 copies of char in whiteclamp movdqu xmm5, xmm3 ; both clamp = black clamp + white clamp diff --git a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c new file mode 100644 index 000000000..ff19ea647 --- /dev/null +++ b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_avx2.h" +#include "vpx_ports/mem.h" + +static void hadamard_col8x2_avx2(__m256i *in, int iter) { + __m256i a0 = in[0]; + __m256i a1 = in[1]; + __m256i a2 = in[2]; + __m256i a3 = in[3]; + __m256i a4 = in[4]; + __m256i a5 = in[5]; + __m256i a6 = in[6]; + __m256i a7 = in[7]; + + __m256i b0 = _mm256_add_epi16(a0, a1); + __m256i b1 = _mm256_sub_epi16(a0, a1); + __m256i b2 = _mm256_add_epi16(a2, a3); + __m256i b3 = _mm256_sub_epi16(a2, a3); + __m256i b4 = _mm256_add_epi16(a4, a5); + __m256i b5 = _mm256_sub_epi16(a4, a5); + __m256i b6 = _mm256_add_epi16(a6, a7); + __m256i b7 = _mm256_sub_epi16(a6, a7); + + a0 = _mm256_add_epi16(b0, b2); + a1 = _mm256_add_epi16(b1, b3); + a2 = _mm256_sub_epi16(b0, b2); + a3 = _mm256_sub_epi16(b1, b3); + a4 = _mm256_add_epi16(b4, b6); + a5 = _mm256_add_epi16(b5, b7); + a6 = _mm256_sub_epi16(b4, b6); + a7 = _mm256_sub_epi16(b5, b7); + + if (iter == 0) { + b0 = _mm256_add_epi16(a0, a4); + b7 = _mm256_add_epi16(a1, a5); + b3 = _mm256_add_epi16(a2, a6); + b4 = _mm256_add_epi16(a3, a7); + b2 = _mm256_sub_epi16(a0, a4); + b6 = _mm256_sub_epi16(a1, a5); + b1 = _mm256_sub_epi16(a2, a6); + b5 = _mm256_sub_epi16(a3, a7); + + a0 = _mm256_unpacklo_epi16(b0, b1); + a1 = _mm256_unpacklo_epi16(b2, b3); + a2 = _mm256_unpackhi_epi16(b0, b1); + a3 = _mm256_unpackhi_epi16(b2, b3); + a4 = _mm256_unpacklo_epi16(b4, b5); + a5 = _mm256_unpacklo_epi16(b6, b7); + a6 = _mm256_unpackhi_epi16(b4, b5); + a7 = _mm256_unpackhi_epi16(b6, b7); + + b0 = _mm256_unpacklo_epi32(a0, a1); + b1 = _mm256_unpacklo_epi32(a4, a5); + b2 = _mm256_unpackhi_epi32(a0, a1); + b3 = _mm256_unpackhi_epi32(a4, a5); + b4 = _mm256_unpacklo_epi32(a2, 
a3); + b5 = _mm256_unpacklo_epi32(a6, a7); + b6 = _mm256_unpackhi_epi32(a2, a3); + b7 = _mm256_unpackhi_epi32(a6, a7); + + in[0] = _mm256_unpacklo_epi64(b0, b1); + in[1] = _mm256_unpackhi_epi64(b0, b1); + in[2] = _mm256_unpacklo_epi64(b2, b3); + in[3] = _mm256_unpackhi_epi64(b2, b3); + in[4] = _mm256_unpacklo_epi64(b4, b5); + in[5] = _mm256_unpackhi_epi64(b4, b5); + in[6] = _mm256_unpacklo_epi64(b6, b7); + in[7] = _mm256_unpackhi_epi64(b6, b7); + } else { + in[0] = _mm256_add_epi16(a0, a4); + in[7] = _mm256_add_epi16(a1, a5); + in[3] = _mm256_add_epi16(a2, a6); + in[4] = _mm256_add_epi16(a3, a7); + in[2] = _mm256_sub_epi16(a0, a4); + in[6] = _mm256_sub_epi16(a1, a5); + in[1] = _mm256_sub_epi16(a2, a6); + in[5] = _mm256_sub_epi16(a3, a7); + } +} + +static void hadamard_8x8x2_avx2(int16_t const *src_diff, ptrdiff_t src_stride, + int16_t *coeff) { + __m256i src[8]; + src[0] = _mm256_loadu_si256((const __m256i *)src_diff); + src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride)); + + hadamard_col8x2_avx2(src, 0); + hadamard_col8x2_avx2(src, 1); + + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x20)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[0], src[1], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[2], src[3], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[4], src[5], 0x31)); + coeff += 16; + _mm256_storeu_si256((__m256i *)coeff, + _mm256_permute2x128_si256(src[6], src[7], 0x31)); +} + +void vpx_hadamard_16x16_avx2(int16_t const *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int idx; +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]); + int16_t *t_coeff = temp_coeff; +#else + int16_t *t_coeff = coeff; +#endif + + for (idx = 0; idx < 2; ++idx) { + int16_t const *src_ptr = src_diff + idx * 8 * src_stride; + hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2)); + } + + for (idx = 0; idx < 64; idx += 16) { + const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff); + const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64)); + const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128)); + const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192)); + + __m256i b0 = _mm256_add_epi16(coeff0, coeff1); + __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); + __m256i b2 = _mm256_add_epi16(coeff2, coeff3); + __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + + b0 = _mm256_srai_epi16(b0, 1); + b1 = _mm256_srai_epi16(b1, 1); + b2 = _mm256_srai_epi16(b2, 1); + b3 = _mm256_srai_epi16(b3, 1); + + store_tran_low(_mm256_add_epi16(b0, b2), coeff); + 
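The three add/sub layers in hadamard_col8x2_avx2 above form a radix-2 8-point Walsh-Hadamard butterfly, applied once per dimension. A scalar sketch of one 8-point pass using the same wrapping 16-bit arithmetic; the SIMD version additionally folds an output permutation and the transpose into its iter == 0 path (hadamard8_1d is a hypothetical name):

#include <stdint.h>

/* One unnormalized 8-point Walsh-Hadamard pass: pairs, quads, then
 * halves, matching the b-then-a add/sub structure above. */
static void hadamard8_1d(const int16_t *in, int16_t *out) {
  int16_t b[8], a[8];
  int i;
  for (i = 0; i < 4; ++i) {
    b[2 * i + 0] = in[2 * i] + in[2 * i + 1];
    b[2 * i + 1] = in[2 * i] - in[2 * i + 1];
  }
  for (i = 0; i < 2; ++i) {
    a[4 * i + 0] = b[4 * i + 0] + b[4 * i + 2];
    a[4 * i + 1] = b[4 * i + 1] + b[4 * i + 3];
    a[4 * i + 2] = b[4 * i + 0] - b[4 * i + 2];
    a[4 * i + 3] = b[4 * i + 1] - b[4 * i + 3];
  }
  for (i = 0; i < 4; ++i) {
    out[i] = a[i] + a[i + 4];
    out[i + 4] = a[i] - a[i + 4];
  }
}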
store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64); + store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128); + store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192); + + coeff += 16; + t_coeff += 16; + } +} + +int vpx_satd_avx2(const tran_low_t *coeff, int length) { + const __m256i one = _mm256_set1_epi16(1); + __m256i accum = _mm256_setzero_si256(); + int i; + + for (i = 0; i < length; i += 16) { + const __m256i src_line = load_tran_low(coeff); + const __m256i abs = _mm256_abs_epi16(src_line); + const __m256i sum = _mm256_madd_epi16(abs, one); + accum = _mm256_add_epi32(accum, sum); + coeff += 16; + } + + { // 32 bit horizontal add + const __m256i a = _mm256_srli_si256(accum, 8); + const __m256i b = _mm256_add_epi32(accum, a); + const __m256i c = _mm256_srli_epi64(b, 32); + const __m256i d = _mm256_add_epi32(b, c); + const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d), + _mm256_extractf128_si256(d, 1)); + return _mm_cvtsi128_si32(accum_128); + } +} diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c index 4e89e07e5..a235ba41d 100644 --- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c @@ -214,7 +214,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) { } } -void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, +void vpx_hadamard_8x8_sse2(int16_t const *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { __m128i src[8]; src[0] = _mm_load_si128((const __m128i *)src_diff); @@ -246,7 +246,7 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, store_tran_low(src[7], coeff); } -void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, +void vpx_hadamard_16x16_sse2(int16_t const *src_diff, ptrdiff_t src_stride, tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { diff --git a/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h index b9116f049..3552c07cd 100644 --- a/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h +++ b/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h @@ -21,10 +21,24 @@ static INLINE __m256i load_tran_low(const tran_low_t *a) { #if CONFIG_VP9_HIGHBITDEPTH const __m256i a_low = _mm256_loadu_si256((const __m256i *)a); - return _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8)); + const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8)); + return _mm256_packs_epi32(a_low, a_high); #else return _mm256_loadu_si256((const __m256i *)a); #endif } +static INLINE void store_tran_low(__m256i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m256i one = _mm256_set1_epi16(1); + const __m256i a_hi = _mm256_mulhi_epi16(a, one); + const __m256i a_lo = _mm256_mullo_epi16(a, one); + const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi); + const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi); + _mm256_storeu_si256((__m256i *)b, a_1); + _mm256_storeu_si256((__m256i *)(b + 8), a_2); +#else + _mm256_storeu_si256((__m256i *)b, a); +#endif +} #endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_ diff --git a/libvpx/vpx_dsp/x86/convolve.h b/libvpx/vpx_dsp/x86/convolve.h index e69d6c617..68d7589d4 100644 --- a/libvpx/vpx_dsp/x86/convolve.h +++ b/libvpx/vpx_dsp/x86/convolve.h @@ -20,14 +20,15 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter); -#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ +#define FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ void 
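vpx_satd_avx2 in this hunk vectorizes a very small reduction; the scalar equivalent is just the sum of absolute coefficients. For comparison (satd_ref is a hypothetical name; tran_low_t is int32_t only under CONFIG_VP9_HIGHBITDEPTH, int16_t otherwise):

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* assuming the highbitdepth build */

/* Sum of absolute transformed differences. */
static int satd_ref(const tran_low_t *coeff, int length) {
  int i, satd = 0;
  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
  return satd;
}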
vpx_convolve8_##name##_##opt( \ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - (void)filter_x; \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter = filter_kernel[offset]; \ + (void)x0_q4; \ (void)x_step_q4; \ - (void)filter_y; \ + (void)y0_q4; \ (void)y_step_q4; \ assert(filter[3] != 128); \ assert(step_q4 == 16); \ @@ -64,32 +65,36 @@ typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch, } \ } -#define FUN_CONV_2D(avg, opt) \ - void vpx_convolve8_##avg##opt( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h) { \ - assert(filter_x[3] != 128); \ - assert(filter_y[3] != 128); \ - assert(w <= 64); \ - assert(h <= 64); \ - assert(x_step_q4 == 16); \ - assert(y_step_q4 == 16); \ - if (filter_x[0] | filter_x[1] | filter_x[2]) { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ - vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, y_step_q4, w, \ - h + 7); \ - vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h); \ - } else { \ - DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ - vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h + 1); \ - vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter_x, \ - x_step_q4, filter_y, y_step_q4, w, h); \ - } \ +#define FUN_CONV_2D(avg, opt) \ + void vpx_convolve8_##avg##opt( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + const int16_t *filter_x = filter[x0_q4]; \ + const int16_t *filter_y = filter[y0_q4]; \ + (void)filter_y; \ + assert(filter_x[3] != 128); \ + assert(filter_y[3] != 128); \ + assert(w <= 64); \ + assert(h <= 64); \ + assert(x_step_q4 == 16); \ + assert(y_step_q4 == 16); \ + if (filter_x[0] | filter_x[1] | filter_x[2]) { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \ + vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \ + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h + 7); \ + vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h); \ + } else { \ + DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \ + vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, x0_q4, \ + x_step_q4, y0_q4, y_step_q4, w, h + 1); \ + vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, \ + h); \ + } \ } #if CONFIG_VP9_HIGHBITDEPTH @@ -101,95 +106,97 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, unsigned int output_height, const int16_t *filter, int bd); -#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \ - void vpx_highbd_convolve8_##name##_##opt( \ - const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[2]) 
{ \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ - } \ - } - -#define HIGH_FUN_CONV_2D(avg, opt) \ - void vpx_highbd_convolve8_##avg##opt( \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ + void vpx_highbd_convolve8_##name##_##opt( \ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - fdata2, 64, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter = filter_kernel[offset]; \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + 
src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_kernel, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h, bd); \ + } \ + } + +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_x = filter[x0_q4]; \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ + fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, \ + w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ + bd); \ + } \ } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_dsp/x86/convolve_avx2.h b/libvpx/vpx_dsp/x86/convolve_avx2.h new file mode 100644 index 000000000..bc96b738f --- /dev/null +++ b/libvpx/vpx_dsp/x86/convolve_avx2.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_CONVOLVE_AVX2_H_ +#define VPX_DSP_X86_CONVOLVE_AVX2_H_ + +#include <immintrin.h> // AVX2 + +#include "./vpx_config.h" + +#if defined(__clang__) +#if (__clang_major__ > 0 && __clang_major__ < 3) || \ + (__clang_major__ == 3 && __clang_minor__ <= 3) || \ + (defined(__APPLE__) && defined(__apple_build_version__) && \ + ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ + (__clang_major__ == 5 && __clang_minor__ == 0))) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#else // clang > 3.3, and not 5.0 on macosx. 
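The FUN_CONV_1D/HIGH_FUN_CONV_1D macros in the convolve.h hunk above all expand to the same dispatch shape: cover the width in 16-, 8- and 4-pixel strips with the generated block kernels, then hand any remainder to the C fallback. A sketch of that shape with the strip kernels as parameters (the real macros hard-code the generated vpx_filter_block1d* names; conv_dispatch_sketch is hypothetical):

#include <stdint.h>

typedef void (*strip_fn)(const uint8_t *src, int src_stride, uint8_t *dst,
                         int dst_stride, int h, const int16_t *filter);

/* Width dispatch: widest strips first, narrower strips for the rest. */
static void conv_dispatch_sketch(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const int16_t *filter,
                                 strip_fn b16, strip_fn b8, strip_fn b4) {
  while (w >= 16) {
    b16(src, src_stride, dst, dst_stride, h, filter);
    src += 16; dst += 16; w -= 16;
  }
  while (w >= 8) {
    b8(src, src_stride, dst, dst_stride, h, filter);
    src += 8; dst += 8; w -= 8;
  }
  while (w >= 4) {
    b4(src, src_stride, dst, dst_stride, h, filter);
    src += 4; dst += 4; w -= 4;
  }
  /* the real macros finish with: if (w) vpx_convolve8_..._c(...) */
  (void)w;
}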
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // clang <= 3.3 +#elif defined(__GNUC__) +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) +#define MM256_BROADCASTSI128_SI256(x) \ + _mm_broadcastsi128_si256((__m128i const *)&(x)) +#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 +#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) +#else // gcc > 4.7 +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // gcc <= 4.6 +#else // !(gcc || clang) +#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) +#endif // __clang__ + +static INLINE void shuffle_filter_avx2(const int16_t *const filter, + __m256i *const f) { + const __m256i f_values = + MM256_BROADCASTSI128_SI256(_mm_load_si128((const __m128i *)filter)); + // pack and duplicate the filter values + f[0] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0200u)); + f[1] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0604u)); + f[2] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0a08u)); + f[3] = _mm256_shuffle_epi8(f_values, _mm256_set1_epi16(0x0e0cu)); +} + +static INLINE __m256i convolve8_16_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m256i k_64 = _mm256_set1_epi16(1 << 6); + const __m256i x0 = _mm256_maddubs_epi16(s[0], f[0]); + const __m256i x1 = _mm256_maddubs_epi16(s[1], f[1]); + const __m256i x2 = _mm256_maddubs_epi16(s[2], f[2]); + const __m256i x3 = _mm256_maddubs_epi16(s[3], f[3]); + __m256i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm256_add_epi16(x0, x2); + sum2 = _mm256_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm256_add_epi16(sum1, k_64); + sum1 = _mm256_adds_epi16(sum1, sum2); + // round and shift by 7 bit each 16 bit + sum1 = _mm256_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_avx2(const __m256i *const s, + const __m256i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[0]), + _mm256_castsi256_si128(f[0])); + const __m128i x1 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[1]), + _mm256_castsi256_si128(f[1])); + const __m128i x2 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[2]), + _mm256_castsi256_si128(f[2])); + const __m128i x3 = _mm_maddubs_epi16(_mm256_castsi256_si128(s[3]), + _mm256_castsi256_si128(f[3])); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +#undef MM256_BROADCASTSI128_SI256 + +#endif // VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/libvpx/vpx_dsp/x86/convolve_ssse3.h b/libvpx/vpx_dsp/x86/convolve_ssse3.h new file mode 100644 index 000000000..e5d452f99 --- /dev/null +++ b/libvpx/vpx_dsp/x86/convolve_ssse3.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_ +#define VPX_DSP_X86_CONVOLVE_SSSE3_H_ + +#include <assert.h> +#include <tmmintrin.h> // SSSE3 + +#include "./vpx_config.h" + +static INLINE void shuffle_filter_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +} + +static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter, + __m128i *const f) { + const __m128i f_values = _mm_load_si128((const __m128i *)filter); + // pack and duplicate the filter values + // It utilizes the fact that the high byte of filter[3] is always 0 to clean + // half of f[0] and f[4]. + assert(filter[3] >= 0 && filter[3] < 256); + f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u)); + f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u)); + f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u)); + f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au)); + f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu)); +} + +static INLINE __m128i convolve8_8_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + __m128i sum1, sum2; + + // sum the results together, saturating only on the final step + // adding x0 with x2 and x1 with x3 is the only order that prevents + // outranges for all filters + sum1 = _mm_add_epi16(x0, x2); + sum2 = _mm_add_epi16(x1, x3); + // add the rounding offset early to avoid another saturated add + sum1 = _mm_add_epi16(sum1, k_64); + sum1 = _mm_adds_epi16(sum1, sum2); + // shift by 7 bit each 16 bit + sum1 = _mm_srai_epi16(sum1, 7); + return sum1; +} + +static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + // compensate the subtracted 64 in f[1]. x4 is always non negative. 
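A scalar model of the rounding scheme these convolve helpers share: 7-bit fixed-point taps, +64 for round-to-nearest, arithmetic shift by 7, clamp to 8 bits. The even/odd-offset variants continuing below additionally re-add the 64 noted in the comment as subtracted from the centre tap, so the packed taps fit a signed-byte range (convolve8_pixel and clip_u8 are hypothetical names):

#include <stdint.h>

static uint8_t clip_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* One output pixel of an 8-tap filter with FILTER_BITS == 7. The SIMD
 * versions add pairwise products as (x0 + x2) then (x1 + x3) so only
 * the final add needs saturation. */
static uint8_t convolve8_pixel(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  return clip_u8((sum + 64) >> 7);
}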
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64)); + // add and saturate the results together + __m128i temp = _mm_adds_epi16(x0, x3); + temp = _mm_adds_epi16(temp, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x4); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s, + const __m128i *const f) { + // multiply 2 adjacent elements with the filter and add the result + const __m128i k_64 = _mm_set1_epi16(1 << 6); + const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]); + const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]); + const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]); + const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]); + const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]); + // compensate the subtracted 64 in f[2]. x5 is always non negative. + const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64)); + __m128i temp; + + // add and saturate the results together + temp = _mm_adds_epi16(x0, x1); + temp = _mm_adds_epi16(temp, x2); + temp = _mm_adds_epi16(temp, x3); + temp = _mm_adds_epi16(temp, x4); + temp = _mm_adds_epi16(temp, x5); + // round and shift by 7 bit each 16 bit + temp = _mm_adds_epi16(temp, k_64); + temp = _mm_srai_epi16(temp, 7); + return temp; +} + +#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_ diff --git a/libvpx/vpx_dsp/x86/deblock_sse2.asm b/libvpx/vpx_dsp/x86/deblock_sse2.asm index bd8fd1248..97cb43b67 100644 --- a/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -83,6 +83,8 @@ add rbx, 16 %endmacro +SECTION .text + ;void vpx_post_proc_down_and_across_mb_row_sse2 ;( ; unsigned char *src_ptr, diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h index 39d3a3f59..132e06523 100644 --- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h +++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h @@ -51,7 +51,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m256i k__cospi_p16_p16 = _mm256_set1_epi16((int16_t)cospi_16_64); + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); const __m256i k__cospi_p16_m16 = pair256_set_epi16(+cospi_16_64, -cospi_16_64); const __m256i k__cospi_m08_p24 = pair256_set_epi16(-cospi_8_64, cospi_24_64); diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h index 374433390..32b9bd281 100644 --- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h +++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h @@ -63,7 +63,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. 
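The "constructing the 32 bit constant corresponding to that pair" comment above refers to packing two 16-bit cosine terms into each 32-bit lane, so that pmaddwd over interleaved inputs yields a*c0 + b*c1 per lane in one instruction. A sketch of the packing (pair_const is a hypothetical name mirroring pair_set_epi16):

#include <stdint.h>

/* Low 16 bits hold c0, high 16 bits hold c1; broadcasting this across
 * a vector gives the pw_<c0>_<c1> constants used above. */
static int32_t pair_const(int16_t c0, int16_t c1) {
  return (int32_t)(((uint32_t)(uint16_t)c1 << 16) | (uint16_t)c0);
}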
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(+cospi_16_64, -cospi_16_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h index 743e55e63..f9abaecf2 100644 --- a/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h +++ b/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h @@ -261,7 +261,7 @@ void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); @@ -582,7 +582,7 @@ void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) { // When we use them, in one case, they are all the same. In all others // it's a pair of them that we need to repeat four times. This is done // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64); diff --git a/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm index b433874f2..32824a03a 100644 --- a/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm +++ b/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm @@ -31,8 +31,8 @@ SECTION .text INIT_XMM ssse3 cglobal fdct8x8, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m12, [pw_11585x2] + mova m8, [GLOBAL(pd_8192)] + mova m12, [GLOBAL(pw_11585x2)] lea r3, [2 * strideq] lea r4, [4 * strideq] @@ -92,10 +92,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(pi / 8), cos(pi / 8) punpcklwd m2, m10, m9 punpckhwd m10, m9 - pmaddwd m5, m2, [pw_15137_6270] - pmaddwd m2, [pw_6270_m15137] - pmaddwd m9, m10, [pw_15137_6270] - pmaddwd m10, [pw_6270_m15137] + pmaddwd m5, m2, [GLOBAL(pw_15137_6270)] + pmaddwd m2, [GLOBAL(pw_6270_m15137)] + pmaddwd m9, m10, [GLOBAL(pw_15137_6270)] + pmaddwd m10, [GLOBAL(pw_6270_m15137)] paddd m5, m8 paddd m2, m8 paddd m9, m8 @@ -120,10 +120,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(pi / 16), cos(pi / 16) punpcklwd m1, m10, m9 punpckhwd m10, m9 - pmaddwd m7, m1, [pw_16069_3196] - pmaddwd m1, [pw_3196_m16069] - pmaddwd m9, m10, [pw_16069_3196] - pmaddwd m10, [pw_3196_m16069] + pmaddwd m7, m1, [GLOBAL(pw_16069_3196)] + pmaddwd m1, [GLOBAL(pw_3196_m16069)] + pmaddwd m9, m10, [GLOBAL(pw_16069_3196)] + pmaddwd m10, [GLOBAL(pw_3196_m16069)] paddd m7, m8 paddd m1, m8 paddd m9, m8 @@ -138,10 +138,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; sin(3 * pi / 16), cos(3 * pi / 16) punpcklwd m11, m0, m3 punpckhwd m0, m3 - pmaddwd m9, m11, [pw_9102_13623] - pmaddwd m11, 
[pw_13623_m9102] - pmaddwd m3, m0, [pw_9102_13623] - pmaddwd m0, [pw_13623_m9102] + pmaddwd m9, m11, [GLOBAL(pw_9102_13623)] + pmaddwd m11, [GLOBAL(pw_13623_m9102)] + pmaddwd m3, m0, [GLOBAL(pw_9102_13623)] + pmaddwd m0, [GLOBAL(pw_13623_m9102)] paddd m9, m8 paddd m11, m8 paddd m3, m8 @@ -211,10 +211,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; stage 3 punpcklwd m6, m1, m3 punpckhwd m1, m3 - pmaddwd m2, m6, [pw_11585_11585] - pmaddwd m6, [pw_11585_m11585] - pmaddwd m3, m1, [pw_11585_11585] - pmaddwd m1, [pw_11585_m11585] + pmaddwd m2, m6, [GLOBAL(pw_11585_11585)] + pmaddwd m6, [GLOBAL(pw_11585_m11585)] + pmaddwd m3, m1, [GLOBAL(pw_11585_11585)] + pmaddwd m1, [GLOBAL(pw_11585_m11585)] paddd m2, m8 paddd m6, m8 paddd m3, m8 @@ -231,10 +231,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride punpcklwd m3, m5, m4 punpckhwd m5, m4 - pmaddwd m1, m3, [pw_15137_6270] - pmaddwd m3, [pw_6270_m15137] - pmaddwd m4, m5, [pw_15137_6270] - pmaddwd m5, [pw_6270_m15137] + pmaddwd m1, m3, [GLOBAL(pw_15137_6270)] + pmaddwd m3, [GLOBAL(pw_6270_m15137)] + pmaddwd m4, m5, [GLOBAL(pw_15137_6270)] + pmaddwd m5, [GLOBAL(pw_6270_m15137)] paddd m1, m8 paddd m3, m8 paddd m4, m8 @@ -255,10 +255,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride ; stage 4 punpcklwd m9, m5, m4 punpckhwd m5, m4 - pmaddwd m7, m9, [pw_16069_3196] - pmaddwd m9, [pw_3196_m16069] - pmaddwd m4, m5, [pw_16069_3196] - pmaddwd m5, [pw_3196_m16069] + pmaddwd m7, m9, [GLOBAL(pw_16069_3196)] + pmaddwd m9, [GLOBAL(pw_3196_m16069)] + pmaddwd m4, m5, [GLOBAL(pw_16069_3196)] + pmaddwd m5, [GLOBAL(pw_3196_m16069)] paddd m7, m8 paddd m9, m8 paddd m4, m8 @@ -272,10 +272,10 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride punpcklwd m4, m10, m0 punpckhwd m10, m0 - pmaddwd m5, m4, [pw_9102_13623] - pmaddwd m4, [pw_13623_m9102] - pmaddwd m0, m10, [pw_9102_13623] - pmaddwd m10, [pw_13623_m9102] + pmaddwd m5, m4, [GLOBAL(pw_9102_13623)] + pmaddwd m4, [GLOBAL(pw_13623_m9102)] + pmaddwd m0, m10, [GLOBAL(pw_9102_13623)] + pmaddwd m10, [GLOBAL(pw_13623_m9102)] paddd m5, m8 paddd m4, m8 paddd m0, m8 diff --git a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c index 2fc7b7430..7e75d5d10 100644 --- a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c @@ -18,13 +18,14 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int width, int h, int bd) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; assert(width % 4 == 0); @@ -99,13 +100,14 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int width, int h, int bd) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; assert(width % 4 == 0); @@ -1073,8 +1075,8 @@ void vpx_highbd_filter_block1d4_v2_sse2(const 
uint16_t *, ptrdiff_t, uint16_t *, #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); HIGH_FUN_CONV_2D(, avx2); void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, @@ -1098,8 +1100,8 @@ void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); HIGH_FUN_CONV_2D(avg_, avx2); diff --git a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c index a2412d124..f4f7235d1 100644 --- a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -8,237 +8,343 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse2(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse2(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse2(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse2(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse2(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + 
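Each highbd_butterfly_sse2 call in the idct16 stages below is a 2-point rotation by a (c0, c1) cosine pair followed by a 14-bit fixed-point round, matching the scalar vpx_idct16_c. A sketch (butterfly and round_shift are hypothetical names):

#include <stdint.h>

#define DCT_CONST_BITS 14

static int32_t round_shift(int64_t v) {
  return (int32_t)((v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* (in0, in1) -> (in0*c0 - in1*c1, in0*c1 + in1*c0), each rounded. */
static void butterfly(int32_t in0, int32_t in1, int c0, int c1,
                      int32_t *out0, int32_t *out1) {
  *out0 = round_shift((int64_t)in0 * c0 - (int64_t)in1 * c1);
  *out1 = round_shift((int64_t)in0 * c1 + (int64_t)in1 * c0);
}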
highbd_butterfly_sse2(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse2(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse2(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2], sign[2]; + + // stage 2 + highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + abs_extend_64bit_sse2(io[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse2(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2], sign[2]; + + // stage 2 + 
highbd_partial_butterfly_sse2(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse2(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step2[11]); // step1[10] = -step1[10] + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step2[12]); // step1[13] = -step1[13] + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + abs_extend_64bit_sse2(io[0], temp, sign); + step2[0] = multiplication_round_shift_sse2(temp, sign, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[16 * 16]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } + int i; + __m128i out[16], *in; - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 32; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_16x16(inptr, inptr + 16); - for (i = 0; i < 16; 
i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; + dest += 8; } } else { - // Run the un-optimised row transform - for (i = 0; i < 16; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; - } - } + __m128i all[4][16]; - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); - } + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + highbd_write_buffer_4(dest + j * stride, out[j], bd); } + dest += 4; } } } -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, +void vpx_highbd_idct16x16_38_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[16 * 16] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[32]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - const __m128i zero = _mm_set1_epi16(0); - const __m128i rounding = _mm_set1_epi16(32); - const __m128i max = _mm_set1_epi16(3155); - const __m128i min = _mm_set1_epi16(-3155); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 16; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 16 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - temp1 = 
_mm_loadu_si128((const __m128i *)(input + 16 * i + 8)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 16 * i + 12)); - inptr[i + 16] = _mm_packs_epi32(temp1, temp2); - } + int i; + __m128i out[16]; - // Find the min & max for the row transform - // Since all non-zero dct coefficients are in upper-left 4x4 area, - // we only need to consider first 4 rows here. - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform (N.B. This transposes inptr) - idct16_sse2(inptr, inptr + 16); - - // Find the min & max for the column transform - // N.B. Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 16; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(input, 16, in); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_8x8(inptr, inptr); - array_transpose_8x8(inptr + 8, inptr + 16); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpacklo_epi16(inptr[i], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 1)), temp2); - sign_bits = _mm_cmplt_epi16(inptr[i + 16], zero); - temp1 = _mm_unpacklo_epi16(inptr[i + 16], sign_bits); - temp2 = _mm_unpackhi_epi16(inptr[i + 16], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 2)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (i * 4 + 3)), temp2); + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; + dest += 8; } } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct16_c(input, outptr, bd); - input += 16; - outptr += 16; + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; } } +} + +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = 
load_pack_8_32bit(input + 3 * 16); - if (optimised_cols) { - idct16_sse2(inptr, inptr + 16); - - // Final round & shift and Reconstruction and Store - { - __m128i d[2]; - for (i = 0; i < 16; i++) { - inptr[i] = _mm_add_epi16(inptr[i], rounding); - inptr[i + 16] = _mm_add_epi16(inptr[i + 16], rounding); - d[0] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - d[1] = _mm_loadu_si128((const __m128i *)(dest + stride * i + 8)); - inptr[i] = _mm_srai_epi16(inptr[i], 6); - inptr[i + 16] = _mm_srai_epi16(inptr[i + 16], 6); - d[0] = clamp_high_sse2(_mm_add_epi16(d[0], inptr[i]), bd); - d[1] = clamp_high_sse2(_mm_add_epi16(d[1], inptr[i + 16]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[0]); - _mm_storeu_si128((__m128i *)(dest + stride * i + 8), d[1]); + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); } + dest += 8; } } else { - // Run the un-optimised column transform - tran_low_t temp_in[16], temp_out[16]; - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; - vpx_highbd_idct16_c(temp_in, temp_out, bd); + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + for (j = 0; j < 16; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + highbd_write_buffer_4(dest + j * stride, out[j], bd); } + dest += 4; } } } diff --git a/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c new file mode 100644 index 000000000..de097c66a --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_idct16x16_add_sse4.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct16_4col_stage5(const __m128i *const in, + __m128i *const out) { + // stage 5 + out[0] = _mm_add_epi32(in[0], in[3]); + out[1] = _mm_add_epi32(in[1], in[2]); + out[2] = _mm_sub_epi32(in[1], in[2]); + out[3] = _mm_sub_epi32(in[0], in[3]); + highbd_butterfly_cospi16_sse4_1(in[6], in[5], &out[6], &out[5]); + out[8] = _mm_add_epi32(in[8], in[11]); + out[9] = _mm_add_epi32(in[9], in[10]); + out[10] = _mm_sub_epi32(in[9], in[10]); + out[11] = _mm_sub_epi32(in[8], in[11]); + out[12] = _mm_sub_epi32(in[15], in[12]); + out[13] = _mm_sub_epi32(in[14], in[13]); + out[14] = _mm_add_epi32(in[14], in[13]); + out[15] = _mm_add_epi32(in[15], in[12]); +} + +static INLINE void highbd_idct16_4col_stage6(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); + out[8] = in[8]; + out[9] = in[9]; + highbd_butterfly_cospi16_sse4_1(in[13], in[10], &out[13], &out[10]); + highbd_butterfly_cospi16_sse4_1(in[12], in[11], &out[12], &out[11]); + out[14] = in[14]; + out[15] = in[15]; +} + +static INLINE void highbd_idct16_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + highbd_butterfly_sse4_1(io[1], io[15], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(io[9], io[7], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(io[5], io[11], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(io[13], io[3], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_butterfly_sse4_1(io[2], io[14], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[10], io[6], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + highbd_butterfly_cospi16_sse4_1(io[0], io[8], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(io[4], io[12], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_38_4col(__m128i 
*const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp1[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[7], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(io[5], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(io[6], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + + // stage 4 + extend_64bit(io[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + step2[1] = step2[0]; + highbd_partial_butterfly_sse4_1(io[4], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step1[4] = _mm_add_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step1[7] = _mm_add_epi32(step1[7], step1[6]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +static INLINE void highbd_idct16x16_10_4col(__m128i *const io /*io[16]*/) { + __m128i step1[16], step2[16]; + __m128i temp[2]; + + // stage 2 + highbd_partial_butterfly_sse4_1(io[1], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(io[3], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + highbd_partial_butterfly_sse4_1(io[2], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + extend_64bit(io[0], temp); + step2[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + step2[1] = step2[0]; + step2[2] = _mm_setzero_si128(); + step2[3] = _mm_setzero_si128(); + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[10], step1[13], -cospi_8_64, -cospi_24_64, + &step2[13], &step2[10]); + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + highbd_idct16_4col_stage5(step2, step1); + highbd_idct16_4col_stage6(step1, step2); + highbd_idct16_4col_stage7(step2, io); +} + +void vpx_highbd_idct16x16_256_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i; + __m128i out[16], *in; + + if (bd == 8) { + __m128i l[16], r[16]; + + in = l; + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + 
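/* [editor's annotation] In the bd == 8 branch the coefficients fit in 16
 * bits, so these loads pack the 32-bit tran_low_t input down to 16-bit
 * lanes and reuse the low-bit-depth idct16_8col() path; higher bit depths
 * take the 32-bit highbd_idct16_4col() route in the else branch below. */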
highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]); + idct16_8col(in, in); + in = r; + input += 128; + } + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[4][16]; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]); + highbd_idct16_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct16_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_38_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], temp[16]; + + highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); + } + idct16_8col(in, temp); + + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(input, 16, in); + highbd_idct16x16_38_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + highbd_idct16x16_38_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +void vpx_highbd_idct16x16_10_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i; + __m128i out[16]; + + if (bd == 8) { + __m128i in[16], l[16]; + + in[0] = load_pack_8_32bit(input + 0 * 16); + in[1] = load_pack_8_32bit(input + 1 * 16); + in[2] = load_pack_8_32bit(input + 2 * 16); + in[3] = load_pack_8_32bit(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + for (i = 0; i < 16; i += 8) { + int j; + idct16x16_10_pass2(l + i, in); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_8(dest + j * stride, in[j], bd); + } + dest += 8; + } + } else { + __m128i all[2][16], *in; + + for (i = 0; i < 2; i++) { + in = all[i]; + highbd_load_transpose_32bit_4x4(input, 16, in); + highbd_idct16x16_10_4col(in); + input += 4 * 16; + } + + for (i = 0; i < 16; i += 4) { + int j; + transpose_32bit_4x4(&all[0][i], out); + highbd_idct16x16_10_4col(out); + + for (j = 0; j < 16; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c index 06f265918..c710e8995 100644 --- a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -14,6 +14,768 @@ #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + 
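/* [editor's annotation, an inference] The two butterflies below swap the
 * cosine and output ordering relative to the SSE4.1 version; this appears
 * to absorb the sign flips noted in the callers ("step1[x] = -step1[x]"),
 * avoiding explicit negations, since SSE2 has no packed 32-bit multiply
 * (_mm_mullo_epi32 is SSE4.1) and the helpers work on magnitudes plus a
 * sign, as in abs_extend_64bit_sse2() above. */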
highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[20], step1[23]); // step2[20] = -step2[20] + step2[21] = _mm_sub_epi32(step1[21], step1[22]); // step2[21] = -step2[21] + step2[22] = _mm_add_epi32(step1[21], step1[22]); + step2[23] = _mm_add_epi32(step1[20], step1[23]); + + step2[24] = _mm_add_epi32(step1[27], step1[24]); + step2[25] = _mm_add_epi32(step1[26], step1[25]); + step2[26] = _mm_sub_epi32(step1[26], step1[25]); // step2[26] = -step2[26] + step2[27] = _mm_sub_epi32(step1[27], step1[24]); // step2[27] = -step2[27] + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse2(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse2(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse2(step2[20], step2[27], cospi_8_64, cospi_24_64, + &step1[27], &step1[20]); + highbd_butterfly_sse2(step2[21], step2[26], cospi_8_64, cospi_24_64, + &step1[26], &step1[21]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + 
out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse2(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse2(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse2(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse2(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse2(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse2(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse2(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void 
highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse2(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse2(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse2(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse2(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse2(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse2(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse2(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse2(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + 
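/* [editor's annotation] Each pass of this loop consumes an 8-row slice of
 * the 32x32 coefficient block as four packed and transposed 8x8 tiles,
 * then runs the 16-bit row idct on it; input += 32 << 3 advances those
 * 8 rows of 32 coefficients. */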
highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_neg_sse2(in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse2(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], 
cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_neg_sse2(in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse2(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[10], step2[11]); // step1[10] = -step1[10] + step1[11] = _mm_add_epi32(step2[10], step2[11]); + step1[12] = _mm_add_epi32(step2[13], step2[12]); + step1[13] = _mm_sub_epi32(step2[13], step2[12]); // step1[13] = -step1[13] + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[15], cospi_15_64, cospi_17_64, + &step1[17], &step1[30]); + highbd_partial_butterfly_sse2(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[11], cospi_11_64, cospi_21_64, + &step1[21], &step1[26]); + + highbd_partial_butterfly_sse2(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[18], step1[19]); // step2[18] = -step2[18] + step2[19] = _mm_add_epi32(step1[18], step1[19]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[22], step1[23]); // step2[22] = -step2[22] + step2[23] = _mm_add_epi32(step1[22], step1[23]); + + step2[24] = _mm_add_epi32(step1[25], step1[24]); + step2[25] = _mm_sub_epi32(step1[25], step1[24]); // step2[25] = -step2[25] + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[29], step1[28]); + step2[29] = _mm_sub_epi32(step1[29], step1[28]); // step2[29] = -step2[29] + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, 
cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse2(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse2(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse2(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + 
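/* [editor's annotation] With only 34 non-zero coefficients, just in[2]
 * and in[6] contribute to this quarter, so the two-input butterflies of
 * the 1024-coefficient path reduce to the single-input partial forms. */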
highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + step1[10] = + _mm_sub_epi32(_mm_setzero_si128(), step1[10]); // step1[10] = -step1[10] + step1[13] = + _mm_sub_epi32(_mm_setzero_si128(), step1[13]); // step1[13] = -step1[13] + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse2(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_neg_sse2(in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse2(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_neg_sse2(in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step2[18] = + _mm_sub_epi32(_mm_setzero_si128(), step2[18]); // step2[18] = -step2[18] + step2[22] = + _mm_sub_epi32(_mm_setzero_si128(), step2[22]); // step2[22] = -step2[22] + step2[25] = + _mm_sub_epi32(_mm_setzero_si128(), step2[25]); // step2[25] = -step2[25] + step2[29] = + _mm_sub_epi32(_mm_setzero_si128(), step2[29]); // step2[29] = -step2[29] + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse2(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse2(step2[18], step2[29], cospi_4_64, cospi_28_64, + &step1[29], &step1[18]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse2(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse2(step2[22], step2[25], cospi_20_64, cospi_12_64, + &step1[25], &step1[22]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_sse2(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + 
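/* [editor's annotation] Column pass of the 34-coefficient path:
 * re-transpose each 8-column strip of the row-pass output, run the 8x32
 * idct again, then clip and add into the 16-bit destination via
 * highbd_write_buffer_8() at the given bit depth. */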
transpose_16bit_8x8(col + i, in); + idct32_34_8x32_sse2(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { highbd_idct_1_add_kernel(input, dest, stride, bd, 32); diff --git a/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c new file mode 100644 index 000000000..2d0a53ac0 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_idct32x32_add_sse4.c @@ -0,0 +1,765 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + highbd_butterfly_sse4_1(step1[14], step1[9], cospi_24_64, cospi_8_64, + &step2[9], &step2[14]); + highbd_butterfly_sse4_1(step1[13], step1[10], -cospi_8_64, cospi_24_64, + &step2[10], &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi32(step2[8], step2[11]); + step1[9] = _mm_add_epi32(step2[9], step2[10]); + step1[10] = _mm_sub_epi32(step2[9], step2[10]); + step1[11] = _mm_sub_epi32(step2[8], step2[11]); + step1[12] = _mm_sub_epi32(step2[15], step2[12]); + step1[13] = _mm_sub_epi32(step2[14], step2[13]); + step1[14] = _mm_add_epi32(step2[14], step2[13]); + step1[15] = _mm_add_epi32(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + highbd_butterfly_sse4_1(step1[13], step1[10], cospi_16_64, cospi_16_64, + &out[10], &out[13]); + highbd_butterfly_sse4_1(step1[12], step1[11], cospi_16_64, cospi_16_64, + &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void highbd_idct32_4x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi32(step1[16], step1[19]); + step2[17] = _mm_add_epi32(step1[17], step1[18]); + step2[18] = _mm_sub_epi32(step1[17], step1[18]); + step2[19] = _mm_sub_epi32(step1[16], step1[19]); + step2[20] = _mm_sub_epi32(step1[23], step1[20]); + step2[21] 
= _mm_sub_epi32(step1[22], step1[21]); + step2[22] = _mm_add_epi32(step1[22], step1[21]); + step2[23] = _mm_add_epi32(step1[23], step1[20]); + + step2[24] = _mm_add_epi32(step1[24], step1[27]); + step2[25] = _mm_add_epi32(step1[25], step1[26]); + step2[26] = _mm_sub_epi32(step1[25], step1[26]); + step2[27] = _mm_sub_epi32(step1[24], step1[27]); + step2[28] = _mm_sub_epi32(step1[31], step1[28]); + step2[29] = _mm_sub_epi32(step1[30], step1[29]); + step2[30] = _mm_add_epi32(step1[29], step1[30]); + step2[31] = _mm_add_epi32(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + highbd_butterfly_sse4_1(step2[29], step2[18], cospi_24_64, cospi_8_64, + &step1[18], &step1[29]); + highbd_butterfly_sse4_1(step2[28], step2[19], cospi_24_64, cospi_8_64, + &step1[19], &step1[28]); + highbd_butterfly_sse4_1(step2[27], step2[20], -cospi_8_64, cospi_24_64, + &step1[20], &step1[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], -cospi_8_64, cospi_24_64, + &step1[21], &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + step2[16] = _mm_add_epi32(step1[16], step1[23]); + step2[17] = _mm_add_epi32(step1[17], step1[22]); + step2[18] = _mm_add_epi32(step1[18], step1[21]); + step2[19] = _mm_add_epi32(step1[19], step1[20]); + step2[20] = _mm_sub_epi32(step1[19], step1[20]); + step2[21] = _mm_sub_epi32(step1[18], step1[21]); + step2[22] = _mm_sub_epi32(step1[17], step1[22]); + step2[23] = _mm_sub_epi32(step1[16], step1[23]); + + step2[24] = _mm_sub_epi32(step1[31], step1[24]); + step2[25] = _mm_sub_epi32(step1[30], step1[25]); + step2[26] = _mm_sub_epi32(step1[29], step1[26]); + step2[27] = _mm_sub_epi32(step1[28], step1[27]); + step2[28] = _mm_add_epi32(step1[27], step1[28]); + step2[29] = _mm_add_epi32(step1[26], step1[29]); + step2[30] = _mm_add_epi32(step1[25], step1[30]); + step2[31] = _mm_add_epi32(step1[24], step1[31]); + + // stage 7 + out[16] = step2[16]; + out[17] = step2[17]; + out[18] = step2[18]; + out[19] = step2[19]; + highbd_butterfly_sse4_1(step2[27], step2[20], cospi_16_64, cospi_16_64, + &out[20], &out[27]); + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_16_64, cospi_16_64, + &out[21], &out[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], cospi_16_64, cospi_16_64, + &out[22], &out[25]); + highbd_butterfly_sse4_1(step2[24], step2[23], cospi_16_64, cospi_16_64, + &out[23], &out[24]); + out[28] = step2[28]; + out[29] = step2[29]; + out[30] = step2[30]; + out[31] = step2[31]; +} + +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_butterfly_sse4_1(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_butterfly_sse4_1(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_butterfly_sse4_1(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], 
step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_butterfly_sse4_1(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_butterfly_sse4_1(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], + &step2[14]); + highbd_butterfly_sse4_1(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_butterfly_sse4_1(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_1024_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_1024_4x32_quarter_1(in, temp); + highbd_idct32_1024_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_1024_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_butterfly_sse4_1(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_butterfly_sse4_1(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], + &step1[30]); + highbd_butterfly_sse4_1(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_butterfly_sse4_1(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], + &step1[28]); + + highbd_butterfly_sse4_1(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_butterfly_sse4_1(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], + &step1[26]); + + highbd_butterfly_sse4_1(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_butterfly_sse4_1(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = 
_mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_1024_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_1024_4x32_quarter_1_2(io, temp); + highbd_idct32_1024_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_1024_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[4][32], io[32]; + + // rows + for (i = 0; i < 4; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &io[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &io[8]); + highbd_load_pack_transpose_32bit_8x8(&input[16], 32, &io[16]); + highbd_load_pack_transpose_32bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + // Transpose 32x8 block to 8x32 block + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + idct32_1024_8x32(io, io); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, io[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 8; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_load_transpose_32bit_8x4(&input[16], 32, &in[16]); + highbd_load_transpose_32bit_8x4(&input[24], 32, &in[24]); + highbd_idct32_1024_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + transpose_32bit_4x4(all[4] + i, out + 16); + transpose_32bit_4x4(all[5] + i, out + 20); + transpose_32bit_4x4(all[6] + i, out + 24); + transpose_32bit_4x4(all[7] + i, out + 28); + highbd_idct32_1024_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- 
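Editor's note: every *_quarter_* kernel above chains one primitive, a fixed-point rotation followed by a rounding shift. As a reading aid for the SIMD, here is a rough scalar model of what highbd_butterfly_sse4_1() computes. It is an illustrative sketch only, not part of the patch; the name scalar_butterfly_model is made up here, and it assumes libvpx's usual convention that the cospi_k_64 constants are cos(k*pi/64) scaled by 2^14 (DCT_CONST_BITS == 14).

#include <stdint.h>

// Scalar model of highbd_butterfly_sse4_1(in0, in1, c0, c1, &out0, &out1);
// the intrinsic version performs this on four 32-bit lanes at once.
static void scalar_butterfly_model(int32_t in0, int32_t in1, int c0, int c1,
                                   int32_t *out0, int32_t *out1) {
  // Rotation by the paired angles; products need 64 bits before rounding.
  const int64_t t0 = (int64_t)in0 * c0 - (int64_t)in1 * c1;
  const int64_t t1 = (int64_t)in0 * c1 + (int64_t)in1 * c0;
  // dct_const_round_shift(): add 2^13, then arithmetic shift right by 14.
  *out0 = (int32_t)((t0 + (1 << 13)) >> 14);
  *out1 = (int32_t)((t1 + (1 << 13)) >> 14);
}

Under that reading, a call such as highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], &step[3]) corresponds to in0 = io[1], in1 = io[3], c0 = cospi_24_64, c1 = cospi_8_64.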
+ +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_partial_butterfly_sse4_1(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + highbd_partial_butterfly_sse4_1(in[8], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_2( + const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + highbd_partial_butterfly_sse4_1(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi32(step2[8], step2[9]); + step1[9] = _mm_sub_epi32(step2[8], step2[9]); + step1[14] = _mm_sub_epi32(step2[15], step2[14]); + step1[15] = _mm_add_epi32(step2[15], step2[14]); + step1[10] = _mm_sub_epi32(step2[11], step2[10]); + step1[11] = _mm_add_epi32(step2[11], step2[10]); + step1[12] = _mm_add_epi32(step2[12], step2[13]); + step1[13] = _mm_sub_epi32(step2[12], step2[13]); + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_135_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_135_4x32_quarter_1(in, temp); + highbd_idct32_135_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_135_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + 
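 // Editor's note: highbd_partial_butterfly_sse4_1() appears to be the
 // degenerate form of the two-input rotation used in the 1024-coefficient
 // path above. The 135- and 34-coefficient builds know that one input of
 // each pair is zero, so the butterfly collapses to two independent
 // scalings. Illustrative scalar model only, assuming the 2^14 cospi
 // fixed-point scale:
 //   out0 = (int32_t)((in * c0 + (1 << 13)) >> 14);
 //   out1 = (int32_t)((in * c1 + (1 << 13)) >> 14);
 // The negated constants passed above (e.g. -cospi_17_64) fold in the sign
 // that the full butterfly's subtract of the zero partner's pair would
 // have produced.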
highbd_partial_butterfly_sse4_1(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + highbd_partial_butterfly_sse4_1(in[13], cospi_19_64, cospi_13_64, &step1[22], + &step1[25]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi32(step1[16], step1[17]); + step2[17] = _mm_sub_epi32(step1[16], step1[17]); + step2[18] = _mm_sub_epi32(step1[19], step1[18]); + step2[19] = _mm_add_epi32(step1[19], step1[18]); + step2[20] = _mm_add_epi32(step1[20], step1[21]); + step2[21] = _mm_sub_epi32(step1[20], step1[21]); + step2[22] = _mm_sub_epi32(step1[23], step1[22]); + step2[23] = _mm_add_epi32(step1[23], step1[22]); + + step2[24] = _mm_add_epi32(step1[24], step1[25]); + step2[25] = _mm_sub_epi32(step1[24], step1[25]); + step2[26] = _mm_sub_epi32(step1[27], step1[26]); + step2[27] = _mm_add_epi32(step1[27], step1[26]); + step2[28] = _mm_add_epi32(step1[28], step1[29]); + step2[29] = _mm_sub_epi32(step1[28], step1[29]); + step2[30] = _mm_sub_epi32(step1[31], step1[30]); + step2[31] = _mm_add_epi32(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_135_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_135_4x32_quarter_1_2(io, temp); + highbd_idct32_135_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_135_add_sse4_1(const tran_low_t *input, + uint16_t *dest, int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[2][32], in[32], out[32]; + + // rows + for (i = 0; i < 2; i++) { + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + highbd_load_pack_transpose_32bit_8x8(&input[8], 32, &in[8]); + idct32_135_8x32_ssse3(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_135_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_135_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_135_4x32(out); + + for (j = 0; j < 32; ++j) { + 
highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} + +// ----------------------------------------------------------------------------- + +// For each 4x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + highbd_partial_butterfly_sse4_1(in[4], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + + // stage 4 + highbd_partial_butterfly_sse4_1(in[0], cospi_16_64, cospi_16_64, &step2[1], + &step2[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_butterfly_sse4_1(step2[6], step2[5], cospi_16_64, cospi_16_64, + &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi32(step1[0], step1[7]); + out[1] = _mm_add_epi32(step1[1], step1[6]); + out[2] = _mm_add_epi32(step1[2], step1[5]); + out[3] = _mm_add_epi32(step1[3], step1[4]); + out[4] = _mm_sub_epi32(step1[3], step1[4]); + out[5] = _mm_sub_epi32(step1[2], step1[5]); + out[6] = _mm_sub_epi32(step1[1], step1[6]); + out[7] = _mm_sub_epi32(step1[0], step1[7]); +} + +// For each 4x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/, + __m128i *out /*out[16]*/) { + __m128i step1[32], step2[32]; + + // stage 2 + highbd_partial_butterfly_sse4_1(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + highbd_partial_butterfly_sse4_1(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void highbd_idct32_34_4x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + highbd_idct32_34_4x32_quarter_1(in, temp); + highbd_idct32_34_4x32_quarter_2(in, temp); + // stage 7 + highbd_add_sub_butterfly(temp, out, 16); +} + +// For each 4x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void highbd_idct32_34_4x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + highbd_partial_butterfly_sse4_1(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + highbd_partial_butterfly_sse4_1(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + highbd_partial_butterfly_sse4_1(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + highbd_partial_butterfly_sse4_1(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = step1[16]; + step2[17] = step1[16]; + step2[18] = step1[19]; + step2[19] = step1[19]; + step2[20] = step1[20]; + step2[21] = step1[20]; + step2[22] = step1[23]; + step2[23] = step1[23]; + + step2[24] = step1[24]; + step2[25] = step1[24]; + step2[26] = step1[27]; + step2[27] = step1[27]; + step2[28] = step1[28]; + step2[29] = step1[28]; + step2[30] = step1[31]; + step2[31] = step1[31]; + + // stage 3 + step1[16] = step2[16]; + step1[31] = 
step2[31]; + highbd_butterfly_sse4_1(step2[30], step2[17], cospi_28_64, cospi_4_64, + &step1[17], &step1[30]); + highbd_butterfly_sse4_1(step2[29], step2[18], -cospi_4_64, cospi_28_64, + &step1[18], &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + highbd_butterfly_sse4_1(step2[26], step2[21], cospi_12_64, cospi_20_64, + &step1[21], &step1[26]); + highbd_butterfly_sse4_1(step2[25], step2[22], -cospi_20_64, cospi_12_64, + &step1[22], &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + highbd_idct32_4x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static void highbd_idct32_34_4x32(__m128i *const io /*io[32]*/) { + __m128i temp[32]; + + highbd_idct32_34_4x32_quarter_1_2(io, temp); + highbd_idct32_34_4x32_quarter_3_4(io, temp); + // final stage + highbd_add_sub_butterfly(temp, io, 32); +} + +void vpx_highbd_idct32x32_34_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + int i, j; + + if (bd == 8) { + __m128i col[32], in[32], out[32]; + + // rows + highbd_load_pack_transpose_32bit_8x8(&input[0], 32, &in[0]); + idct32_34_8x32_ssse3(in, col); + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col + i, in); + idct32_34_8x32_ssse3(in, out); + for (j = 0; j < 32; ++j) { + highbd_write_buffer_8(dest + j * stride, out[j], bd); + } + dest += 8; + } + } else { + __m128i all[8][32], out[32], *in; + + for (i = 0; i < 4; i++) { + in = all[i]; + highbd_load_transpose_32bit_8x4(&input[0], 32, &in[0]); + highbd_load_transpose_32bit_8x4(&input[8], 32, &in[8]); + highbd_idct32_34_4x32(in); + input += 4 * 32; + } + + for (i = 0; i < 32; i += 4) { + transpose_32bit_4x4(all[0] + i, out + 0); + transpose_32bit_4x4(all[1] + i, out + 4); + transpose_32bit_4x4(all[2] + i, out + 8); + transpose_32bit_4x4(all[3] + i, out + 12); + highbd_idct32_34_4x32(out); + + for (j = 0; j < 32; ++j) { + highbd_write_buffer_4(dest + j * stride, out[j], bd); + } + dest += 4; + } + } +} diff --git a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c index 89a2584e3..2e54d2473 100644 --- a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -8,144 +8,152 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE __m128i dct_const_round_shift_4_sse2(const __m128i in0, + const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 1 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 2, 3 + const __m128i t2 = _mm_unpacklo_epi64(t0, t1); // 0, 1, 2, 3 + return dct_const_round_shift_sse2(t2); +} + +static INLINE void highbd_idct4_small_sse2(__m128i *const io) { + const __m128i cospi_p16_p16 = _mm_setr_epi32(cospi_16_64, 0, cospi_16_64, 0); + const __m128i cospi_p08_p08 = _mm_setr_epi32(cospi_8_64, 0, cospi_8_64, 0); + const __m128i cospi_p24_p24 = _mm_setr_epi32(cospi_24_64, 0, cospi_24_64, 0); + __m128i temp1[4], temp2[4], step[4]; + + transpose_32bit_4x4(io, io); + + // Note: There is no 32-bit signed multiply SIMD instruction in SSE2. + // _mm_mul_epu32() is used which can only guarantee the lower 32-bit + // (signed) result is meaningful, which is enough in this function. 
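 // Editor's note, expanding on the point above: _mm_mul_epu32() multiplies
 // only the even 32-bit lanes (hence the _mm_srli_si128(x, 4) copies below
 // to reach the odd lanes) and treats them as unsigned. In two's complement
 // the low 32 bits of an unsigned and a signed 32x32 product are identical;
 // for example (-3) * 11585 = -34755, and (0xFFFFFFFD * 11585) mod 2^32
 // reinterpreted as int32 is likewise -34755. On this path the operands
 // stay small (inputs fit in 16 bits, sums in 17) while every cospi
 // constant is below 2^14, so the true signed product stays below 2^31 and
 // the low 32 bits gathered by dct_const_round_shift_4_sse2() are exact.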
+ + // stage 1 + temp1[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + temp2[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + temp1[1] = _mm_srli_si128(temp1[0], 4); // 1, 3 + temp2[1] = _mm_srli_si128(temp2[0], 4); // 1, 3 + temp1[0] = _mm_mul_epu32(temp1[0], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp1[1] = _mm_mul_epu32(temp1[1], cospi_p16_p16); // ([0] + [2])*cospi_16_64 + temp2[0] = _mm_mul_epu32(temp2[0], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + temp2[1] = _mm_mul_epu32(temp2[1], cospi_p16_p16); // ([0] - [2])*cospi_16_64 + step[0] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[1] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + temp1[3] = _mm_srli_si128(io[1], 4); + temp2[3] = _mm_srli_si128(io[3], 4); + temp1[0] = _mm_mul_epu32(io[1], cospi_p24_p24); // input[1] * cospi_24_64 + temp1[1] = _mm_mul_epu32(temp1[3], cospi_p24_p24); // input[1] * cospi_24_64 + temp2[0] = _mm_mul_epu32(io[1], cospi_p08_p08); // input[1] * cospi_8_64 + temp2[1] = _mm_mul_epu32(temp1[3], cospi_p08_p08); // input[1] * cospi_8_64 + temp1[2] = _mm_mul_epu32(io[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp1[3] = _mm_mul_epu32(temp2[3], cospi_p08_p08); // input[3] * cospi_8_64 + temp2[2] = _mm_mul_epu32(io[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp2[3] = _mm_mul_epu32(temp2[3], cospi_p24_p24); // input[3] * cospi_24_64 + temp1[0] = _mm_sub_epi64(temp1[0], temp1[2]); // [1]*cospi_24 - [3]*cospi_8 + temp1[1] = _mm_sub_epi64(temp1[1], temp1[3]); // [1]*cospi_24 - [3]*cospi_8 + temp2[0] = _mm_add_epi64(temp2[0], temp2[2]); // [1]*cospi_8 + [3]*cospi_24 + temp2[1] = _mm_add_epi64(temp2[1], temp2[3]); // [1]*cospi_8 + [3]*cospi_24 + step[2] = dct_const_round_shift_4_sse2(temp1[0], temp1[1]); + step[3] = dct_const_round_shift_4_sse2(temp2[0], temp2[1]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +static INLINE void highbd_idct4_large_sse2(__m128i *const io) { + __m128i step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + highbd_butterfly_cospi16_sse2(io[0], io[2], &step[0], &step[1]); + highbd_butterfly_sse2(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[4 * 4]; - tran_low_t *outptr = out; - int i, j; - __m128i inptr[4]; - __m128i sign_bits[2]; - __m128i temp_mm, min_input, max_input; - int test; - int optimised_cols = 0; - const __m128i zero = _mm_set1_epi16(0); - const __m128i eight = _mm_set1_epi16(8); - const __m128i max = _mm_set1_epi16(12043); - const __m128i min = _mm_set1_epi16(-12043); - // Load input into __m128i - inptr[0] = _mm_loadu_si128((const __m128i *)input); - inptr[1] = _mm_loadu_si128((const __m128i *)(input + 4)); - inptr[2] = _mm_loadu_si128((const __m128i *)(input + 8)); - inptr[3] = _mm_loadu_si128((const __m128i *)(input + 12)); - - // Pack to 16 bits - inptr[0] = _mm_packs_epi32(inptr[0], inptr[1]); - inptr[1] = _mm_packs_epi32(inptr[2], inptr[3]); - - max_input = _mm_max_epi16(inptr[0], 
inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (!test) { - // Do the row transform - idct4_sse2(inptr); - - // Check the min & max values - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp_mm = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp_mm); - - if (test) { - transpose_16bit_4x4(inptr); - sign_bits[0] = _mm_cmplt_epi16(inptr[0], zero); - sign_bits[1] = _mm_cmplt_epi16(inptr[1], zero); - inptr[3] = _mm_unpackhi_epi16(inptr[1], sign_bits[1]); - inptr[2] = _mm_unpacklo_epi16(inptr[1], sign_bits[1]); - inptr[1] = _mm_unpackhi_epi16(inptr[0], sign_bits[0]); - inptr[0] = _mm_unpacklo_epi16(inptr[0], sign_bits[0]); - _mm_storeu_si128((__m128i *)outptr, inptr[0]); - _mm_storeu_si128((__m128i *)(outptr + 4), inptr[1]); - _mm_storeu_si128((__m128i *)(outptr + 8), inptr[2]); - _mm_storeu_si128((__m128i *)(outptr + 12), inptr[3]); - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct4_c(input, outptr, bd); - input += 4; - outptr += 4; - } + int16_t max = 0, min = 0; + __m128i io[4], io_short[2]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + + if (bd != 8) { + __m128i max_input, min_input; + + max_input = _mm_max_epi16(io_short[0], io_short[1]); + min_input = _mm_min_epi16(io_short[0], io_short[1]); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 8)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 8)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 4)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 4)); + max_input = _mm_max_epi16(max_input, _mm_srli_si128(max_input, 2)); + min_input = _mm_min_epi16(min_input, _mm_srli_si128(min_input, 2)); + max = _mm_extract_epi16(max_input, 0); + min = _mm_extract_epi16(min_input, 0); } - if (optimised_cols) { - idct4_sse2(inptr); - - // Final round and shift - inptr[0] = _mm_add_epi16(inptr[0], eight); - inptr[1] = _mm_add_epi16(inptr[1], eight); - - inptr[0] = _mm_srai_epi16(inptr[0], 4); - inptr[1] = _mm_srai_epi16(inptr[1], 4); - - // Reconstruction and Store - { - __m128i d0 = _mm_loadl_epi64((const __m128i *)dest); - __m128i d2 = _mm_loadl_epi64((const __m128i *)(dest + stride * 2)); - d0 = _mm_unpacklo_epi64( - d0, _mm_loadl_epi64((const __m128i *)(dest + stride))); - d2 = _mm_unpacklo_epi64( - d2, _mm_loadl_epi64((const __m128i *)(dest + stride * 3))); - d0 = clamp_high_sse2(_mm_adds_epi16(d0, inptr[0]), bd); - d2 = clamp_high_sse2(_mm_adds_epi16(d2, inptr[1]), bd); - // store input0 - _mm_storel_epi64((__m128i *)dest, d0); - // store input1 - d0 = _mm_srli_si128(d0, 8); - _mm_storel_epi64((__m128i *)(dest + stride), d0); - // store input2 - _mm_storel_epi64((__m128i *)(dest + stride * 2), d2); - // store input3 - d2 = _mm_srli_si128(d2, 8); - _mm_storel_epi64((__m128i *)(dest + stride * 3), d2); - } + if (bd == 8 || 
(max < 4096 && min >= -4096)) { + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); } else { - // Run the un-optimised column transform - tran_low_t temp_in[4], temp_out[4]; - // Columns - for (i = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i]; - vpx_highbd_idct4_c(temp_in, temp_out, bd); - for (j = 0; j < 4; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd); - } + if (max < 32767 && min > -32768) { + highbd_idct4_small_sse2(io); + highbd_idct4_small_sse2(io); + } else { + highbd_idct4_large_sse2(io); + highbd_idct4_large_sse2(io); } + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); } + + recon_and_store_4x4(io, dest, stride, bd); } void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - const __m128i zero = _mm_setzero_si128(); - // Faster than _mm_set1_epi16((1 << bd) - 1). - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); int a1, i; tran_low_t out; __m128i dc, d; - out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); dc = _mm_set1_epi16(a1); for (i = 0; i < 4; ++i) { d = _mm_loadl_epi64((const __m128i *)dest); - d = add_dc_clamp(&zero, &max, &dc, &d); + d = add_clamp(d, dc, bd); _mm_storel_epi64((__m128i *)dest, d); dest += stride; } diff --git a/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c new file mode 100644 index 000000000..38e64f3bc --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_idct4x4_add_sse4.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static INLINE void highbd_idct4(__m128i *const io) { + __m128i temp[2], step[4]; + + transpose_32bit_4x4(io, io); + + // stage 1 + temp[0] = _mm_add_epi32(io[0], io[2]); // input[0] + input[2] + extend_64bit(temp[0], temp); + step[0] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + temp[0] = _mm_sub_epi32(io[0], io[2]); // input[0] - input[2] + extend_64bit(temp[0], temp); + step[1] = multiplication_round_shift_sse4_1(temp, cospi_16_64); + highbd_butterfly_sse4_1(io[1], io[3], cospi_24_64, cospi_8_64, &step[2], + &step[3]); + + // stage 2 + io[0] = _mm_add_epi32(step[0], step[3]); // step[0] + step[3] + io[1] = _mm_add_epi32(step[1], step[2]); // step[1] + step[2] + io[2] = _mm_sub_epi32(step[1], step[2]); // step[1] - step[2] + io[3] = _mm_sub_epi32(step[0], step[3]); // step[0] - step[3] +} + +void vpx_highbd_idct4x4_16_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[4]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 8)); + io[3] = _mm_load_si128((const __m128i *)(input + 12)); + + if (bd == 8) { + __m128i io_short[2]; + + io_short[0] = _mm_packs_epi32(io[0], io[1]); + io_short[1] = _mm_packs_epi32(io[2], io[3]); + idct4_sse2(io_short); + idct4_sse2(io_short); + io_short[0] = _mm_add_epi16(io_short[0], _mm_set1_epi16(8)); + io_short[1] = _mm_add_epi16(io_short[1], _mm_set1_epi16(8)); + io[0] = _mm_srai_epi16(io_short[0], 4); + io[1] = _mm_srai_epi16(io_short[1], 4); + } else { + highbd_idct4(io); + highbd_idct4(io); + io[0] = wraplow_16bit_shift4(io[0], io[1], _mm_set1_epi32(8)); + io[1] = wraplow_16bit_shift4(io[2], io[3], _mm_set1_epi32(8)); + } + + recon_and_store_4x4(io, dest, stride, bd); +} diff --git a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 29cc1d30e..909a6b794 100644 --- a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -8,211 +8,203 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse2(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse2(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse2(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse2(step1[1], step1[3], cospi_24_64, cospi_8_64, &step2[2], + &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[4], sign[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + abs_extend_64bit_sse2(io[1], temp1, sign); + step1[4] = multiplication_round_shift_sse2(temp1, sign, cospi_28_64); + step1[7] = multiplication_round_shift_sse2(temp1, sign, cospi_4_64); + abs_extend_64bit_sse2(io[3], temp1, sign); + step1[5] = multiplication_neg_round_shift_sse2(temp1, sign, cospi_20_64); + step1[6] = multiplication_round_shift_sse2(temp1, sign, cospi_12_64); + + // stage 2 + abs_extend_64bit_sse2(step1[0], temp1, sign); + step2[0] = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + abs_extend_64bit_sse2(step1[1], temp1, sign); + step2[2] = multiplication_round_shift_sse2(temp1, sign, cospi_24_64); + step2[3] = multiplication_round_shift_sse2(temp1, sign, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse2(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[8 * 8]; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, 
temp2); - } - - // Find the min & max for the row transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - array_transpose_8x8(inptr, inptr); - for (i = 0; i < 8; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + idct8_sse2(io_short); + idct8_sse2(io_short); + round_shift_8x8(io_short, io); } else { - // Run the un-optimised row transform - for (i = 0; i < 8; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = 
_mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); } - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } - } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } - } + recon_and_store_8x8(io, dest, stride, bd); } void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - int i, j, test; - __m128i inptr[8]; - __m128i min_input, max_input, temp1, temp2, sign_bits; - const __m128i zero = _mm_set1_epi16(0); - const __m128i sixteen = _mm_set1_epi16(16); - const __m128i max = _mm_set1_epi16(6201); - const __m128i min = _mm_set1_epi16(-6201); - int optimised_cols = 0; - - // Load input into __m128i & pack to 16 bits - for (i = 0; i < 8; i++) { - temp1 = _mm_loadu_si128((const __m128i *)(input + 8 * i)); - temp2 = _mm_loadu_si128((const __m128i *)(input + 8 * i + 4)); - inptr[i] = _mm_packs_epi32(temp1, temp2); - } + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; - // Find the min & max for the row transform - // only first 4 row has non-zero coefs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 4; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (!test) { - // Do the row transform - idct8_sse2(inptr); - - // Find the min & max for the column transform - // N.B. 
Only first 4 cols contain non-zero coeffs - max_input = _mm_max_epi16(inptr[0], inptr[1]); - min_input = _mm_min_epi16(inptr[0], inptr[1]); - for (i = 2; i < 8; i++) { - max_input = _mm_max_epi16(max_input, inptr[i]); - min_input = _mm_min_epi16(min_input, inptr[i]); - } - max_input = _mm_cmpgt_epi16(max_input, max); - min_input = _mm_cmplt_epi16(min_input, min); - temp1 = _mm_or_si128(max_input, min_input); - test = _mm_movemask_epi8(temp1); - - if (test) { - // Use fact only first 4 rows contain non-zero coeffs - array_transpose_4X8(inptr, inptr); - for (i = 0; i < 4; i++) { - sign_bits = _mm_cmplt_epi16(inptr[i], zero); - temp1 = _mm_unpackhi_epi16(inptr[i], sign_bits); - temp2 = _mm_unpacklo_epi16(inptr[i], sign_bits); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i + 1)), temp1); - _mm_storeu_si128((__m128i *)(outptr + 4 * (2 * i)), temp2); - } - } else { - // Set to use the optimised transform for the column - optimised_cols = 1; - } - } else { - // Run the un-optimised row transform - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - } + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); - if (optimised_cols) { - idct8_sse2(inptr); - - // Final round & shift and Reconstruction and Store - { - __m128i d[8]; - for (i = 0; i < 8; i++) { - inptr[i] = _mm_add_epi16(inptr[i], sixteen); - d[i] = _mm_loadu_si128((const __m128i *)(dest + stride * i)); - inptr[i] = _mm_srai_epi16(inptr[i], 5); - d[i] = clamp_high_sse2(_mm_adds_epi16(d[i], inptr[i]), bd); - // Store - _mm_storeu_si128((__m128i *)(dest + stride * i), d[i]); - } - } + idct8x8_12_add_kernel_sse2(io_short); + round_shift_8x8(io_short, io); } else { - // Run the un-optimised column transform - tran_low_t temp_in[8], temp_out[8]; - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); } + + recon_and_store_8x8(io, dest, stride, bd); } void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, diff --git a/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c new file mode 100644 index 000000000..ae391b2c0 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_idct8x8_add_sse4.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" + +static void highbd_idct8x8_half1d(__m128i *const io) { + __m128i step1[8], step2[8]; + + transpose_32bit_4x4x2(io, io); + + // stage 1 + step1[0] = io[0]; + step1[2] = io[4]; + step1[1] = io[2]; + step1[3] = io[6]; + highbd_butterfly_sse4_1(io[1], io[7], cospi_28_64, cospi_4_64, &step1[4], + &step1[7]); + highbd_butterfly_sse4_1(io[5], io[3], cospi_12_64, cospi_20_64, &step1[5], + &step1[6]); + + // stage 2 + highbd_butterfly_cospi16_sse4_1(step1[0], step1[2], &step2[0], &step2[1]); + highbd_butterfly_sse4_1(step1[1], step1[3], cospi_24_64, cospi_8_64, + &step2[2], &step2[3]); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[1], step2[2]); + step1[2] = _mm_sub_epi32(step2[1], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +static void highbd_idct8x8_12_half1d(__m128i *const io) { + __m128i temp1[2], step1[8], step2[8]; + + transpose_32bit_4x4(io, io); + + // stage 1 + step1[0] = io[0]; + step1[1] = io[2]; + extend_64bit(io[1], temp1); + step1[4] = multiplication_round_shift_sse4_1(temp1, cospi_28_64); + step1[7] = multiplication_round_shift_sse4_1(temp1, cospi_4_64); + extend_64bit(io[3], temp1); + step1[5] = multiplication_round_shift_sse4_1(temp1, -cospi_20_64); + step1[6] = multiplication_round_shift_sse4_1(temp1, cospi_12_64); + + // stage 2 + extend_64bit(step1[0], temp1); + step2[0] = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + extend_64bit(step1[1], temp1); + step2[2] = multiplication_round_shift_sse4_1(temp1, cospi_24_64); + step2[3] = multiplication_round_shift_sse4_1(temp1, cospi_8_64); + step2[4] = _mm_add_epi32(step1[4], step1[5]); + step2[5] = _mm_sub_epi32(step1[4], step1[5]); + step2[6] = _mm_sub_epi32(step1[7], step1[6]); + step2[7] = _mm_add_epi32(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi32(step2[0], step2[3]); + step1[1] = _mm_add_epi32(step2[0], step2[2]); + step1[2] = _mm_sub_epi32(step2[0], step2[2]); + step1[3] = _mm_sub_epi32(step2[0], step2[3]); + step1[4] = step2[4]; + highbd_butterfly_cospi16_sse4_1(step2[6], step2[5], &step1[6], &step1[5]); + step1[7] = step2[7]; + + // stage 4 + highbd_idct8_stage4(step1, io); +} + +void vpx_highbd_idct8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + io[7] = _mm_load_si128((const __m128i *)(input + 
3 * 8 + 4)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], io[4]); + io_short[1] = _mm_packs_epi32(io[1], io[5]); + io_short[2] = _mm_packs_epi32(io[2], io[6]); + io_short[3] = _mm_packs_epi32(io[3], io[7]); + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + io_short[4] = _mm_packs_epi32(io[8], io[12]); + io_short[5] = _mm_packs_epi32(io[9], io[13]); + io_short[6] = _mm_packs_epi32(io[10], io[14]); + io_short[7] = _mm_packs_epi32(io[11], io[15]); + + idct8_sse2(io_short); + idct8_sse2(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_half1d(io); + + io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0)); + io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4)); + io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0)); + io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4)); + io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0)); + io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4)); + io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0)); + io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4)); + highbd_idct8x8_half1d(&io[8]); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + io[4] = io[8]; + io[5] = io[9]; + io[6] = io[10]; + io[7] = io[11]; + highbd_idct8x8_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} + +void vpx_highbd_idct8x8_12_add_sse4_1(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + __m128i io[16]; + + io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0)); + io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0)); + io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0)); + io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0)); + + if (bd == 8) { + __m128i io_short[8]; + + io_short[0] = _mm_packs_epi32(io[0], zero); + io_short[1] = _mm_packs_epi32(io[1], zero); + io_short[2] = _mm_packs_epi32(io[2], zero); + io_short[3] = _mm_packs_epi32(io[3], zero); + + idct8x8_12_add_kernel_ssse3(io_short); + round_shift_8x8(io_short, io); + } else { + __m128i temp[4]; + + highbd_idct8x8_12_half1d(io); + + temp[0] = io[4]; + temp[1] = io[5]; + temp[2] = io[6]; + temp[3] = io[7]; + highbd_idct8x8_12_half1d(io); + + io[8] = temp[0]; + io[9] = temp[1]; + io[10] = temp[2]; + io[11] = temp[3]; + highbd_idct8x8_12_half1d(&io[8]); + + highbd_idct8x8_final_round(io); + } + + recon_and_store_8x8(io, dest, stride, bd); +} diff --git a/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c new file mode 100644 index 000000000..2051381aa --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + (void)above; + (void)bd; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6)); + dst += stride; + _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7)); +} + +static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)*dst, val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = 
_mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_16_unpacklo(&dst, stride, &row0); + h_store_16_unpacklo(&dst, stride, &row1); + h_store_16_unpacklo(&dst, stride, &row2); + h_store_16_unpacklo(&dst, stride, &row3); + h_store_16_unpackhi(&dst, stride, &row4); + h_store_16_unpackhi(&dst, stride, &row5); + h_store_16_unpackhi(&dst, stride, &row6); + h_store_16_unpackhi(&dst, stride, &row7); + } +} + +static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpacklo_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride, + const __m128i *row) { + const __m128i val = _mm_unpackhi_epi64(*row, *row); + _mm_store_si128((__m128i *)(*dst), val); + _mm_store_si128((__m128i *)(*dst + 8), val); + _mm_store_si128((__m128i *)(*dst + 16), val); + _mm_store_si128((__m128i *)(*dst + 24), val); + *dst += stride; +} + +void vpx_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const __m128i left_u16 = _mm_load_si128((const __m128i *)left); + const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0); + const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55); + const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa); + const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff); + const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0); + const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55); + const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa); + const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff); + h_store_32_unpacklo(&dst, stride, &row0); + h_store_32_unpacklo(&dst, stride, &row1); + h_store_32_unpacklo(&dst, stride, &row2); + h_store_32_unpacklo(&dst, stride, &row3); + h_store_32_unpackhi(&dst, stride, &row4); + h_store_32_unpackhi(&dst, stride, &row5); + h_store_32_unpackhi(&dst, stride, &row6); + h_store_32_unpackhi(&dst, stride, &row7); + } +} + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE __m128i dc_sum_4(const uint16_t *ref) { + const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + _mm_storel_epi64((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void 
vpx_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i two = _mm_cvtsi32_si128(2); + const __m128i sum = dc_sum_4(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_4x4(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE __m128i dc_sum_8(const uint16_t *ref) { + const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref); + const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8)); + const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe); + const __m128i a = _mm_add_epi16(_dcba, _xxdc); + + return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1)); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i four = _mm_cvtsi32_si128(4); + const __m128i sum = dc_sum_8(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3); + (void)left; + (void)bd; + dc_store_8x8(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_8x8(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE __m128i dc_sum_16(const uint16_t *ref) { + const __m128i sum_lo = dc_sum_8(ref); + const __m128i sum_hi = dc_sum_8(ref + 8); + return _mm_add_epi16(sum_lo, sum_hi); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(left); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const 
uint16_t *above, + const uint16_t *left, int bd) { + const __m128i eight = _mm_cvtsi32_si128(8); + const __m128i sum = dc_sum_16(above); + const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_16x16(dst, stride, &dc_dup); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE __m128i dc_sum_32(const uint16_t *ref) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sum_a = dc_sum_16(ref); + const __m128i sum_b = dc_sum_16(ref + 16); + // 12 bit bd will outrange, so expand to 32 bit before adding final total + return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero), + _mm_unpacklo_epi16(sum_b, zero)); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const __m128i *dc) { + const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0); + const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo); + int i; + for (i = 0; i < 32; ++i, dst += stride) { + _mm_store_si128((__m128i *)dst, dc_dup); + _mm_store_si128((__m128i *)(dst + 8), dc_dup); + _mm_store_si128((__m128i *)(dst + 16), dc_dup); + _mm_store_si128((__m128i *)(dst + 24), dc_dup); + } +} + +void vpx_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(left); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i sixteen = _mm_cvtsi32_si128(16); + const __m128i sum = dc_sum_32(above); + const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, &dc); +} + +void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1)); + const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0); + (void)above; + (void)left; + dc_store_32x32(dst, stride, &dc_dup); +} + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d117_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const 
__m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i IXABCD00 = _mm_srli_si128(KJIXABCD, 4); + const __m128i avg2 = _mm_avg_epu16(KJIXABCD, JIXABCD0); + const __m128i avg3 = avg3_epu16(&KJIXABCD, &JIXABCD0, &IXABCD00); + const __m128i row0 = _mm_srli_si128(avg2, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg2, 4); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); + + dst -= stride; + dst[0] = _mm_extract_epi16(avg3, 1); + dst[stride] = _mm_extract_epi16(avg3, 0); +} + +void vpx_highbd_d135_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXABCD = _mm_loadu_si128((const __m128i *)(above - 4)); + const __m128i KXXXABCD = _mm_insert_epi16(XXXXABCD, K, 0); + const __m128i KJXXABCD = _mm_insert_epi16(KXXXABCD, J, 1); + const __m128i KJIXABCD = _mm_insert_epi16(KJXXABCD, I, 2); + const __m128i JIXABCD0 = _mm_srli_si128(KJIXABCD, 2); + const __m128i LKJIXABC = _mm_insert_epi16(_mm_slli_si128(KJIXABCD, 2), L, 0); + const __m128i avg3 = avg3_epu16(&JIXABCD0, &KJIXABCD, &LKJIXABC); + const __m128i row0 = _mm_srli_si128(avg3, 6); + const __m128i row1 = _mm_srli_si128(avg3, 4); + const __m128i row2 = _mm_srli_si128(avg3, 2); + const __m128i row3 = avg3; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d153_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int I = left[0]; + const int J = left[1]; + const int K = left[2]; + const int L = left[3]; + const __m128i XXXXXABC = _mm_loadu_si128((const __m128i *)(above - 5)); + const __m128i LXXXXABC = _mm_insert_epi16(XXXXXABC, L, 0); + const __m128i LKXXXABC = _mm_insert_epi16(LXXXXABC, K, 1); + const __m128i LKJXXABC = _mm_insert_epi16(LKXXXABC, J, 2); + const __m128i LKJIXABC = _mm_insert_epi16(LKJXXABC, I, 3); + const __m128i KJIXABC0 = _mm_srli_si128(LKJIXABC, 2); + const __m128i JIXABC00 = _mm_srli_si128(LKJIXABC, 4); + const __m128i avg3 = avg3_epu16(&LKJIXABC, &KJIXABC0, &JIXABC00); + const __m128i avg2 = _mm_avg_epu16(LKJIXABC, KJIXABC0); + const __m128i row3 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row2 = _mm_srli_si128(row3, 4); + const __m128i row1 = _mm_srli_si128(row3, 8); + const __m128i row0 = _mm_srli_si128(avg3, 4); + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst[0] = _mm_extract_epi16(avg2, 3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left); + const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff); + const __m128i IJKLLLLL = 
_mm_unpacklo_epi64(IJKL0000, LLLL0000); + const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2); + const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4); + const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00); + const __m128i avg2 = _mm_avg_epu16(IJKLLLLL, JKLLLLL0); + const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row1 = _mm_srli_si128(row0, 4); + const __m128i row2 = _mm_srli_si128(row0, 8); + const __m128i row3 = LLLL0000; + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} + +void vpx_highbd_d63_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGH0); + const __m128i row0 = avg2; + const __m128i row1 = avg3; + const __m128i row2 = _mm_srli_si128(avg2, 2); + const __m128i row3 = _mm_srli_si128(avg3, 2); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} diff --git a/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c new file mode 100644 index 000000000..b9dcef205 --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -0,0 +1,930 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00); + (void)left; + (void)bd; + _mm_storel_epi64((__m128i *)dst, avg3); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4)); + dst += stride; + _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6)); + dst[3] = above[7]; // aka H +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + __m128i *row, const __m128i *ar) { + *row = _mm_alignr_epi8(*ar, *row, 2); + _mm_store_si128((__m128i *)*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3); + dst += stride; + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); + d45_store_8(&dst, stride, &avg3, &HHHHHHHH); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + __m128i *row_0, __m128i *row_1, + const __m128i *ar) { + *row_0 = _mm_alignr_epi8(*row_1, *row_0, 2); + *row_1 = _mm_alignr_epi8(*ar, *row_1, 2); + _mm_store_si128((__m128i *)*dst, *row_0); + _mm_store_si128((__m128i *)(*dst + 8), *row_1); + *dst += stride; +} + +void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i 
avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); + d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR); +} + +void vpx_highbd_d45_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + int i; + (void)left; + (void)bd; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + for (i = 1; i < 32; ++i) { + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + } +} + +DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = { + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1 +}; + +static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) { + *a = _mm_shuffle_epi8(*a, *rotrw); + return *a; +} + +void vpx_highbd_d117_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i IXABCDEF = + _mm_alignr_epi8(XABCDEFG, _mm_slli_si128(IJKLMNOP, 14), 14); + const __m128i 
avg3 = avg3_epu16(&ABCDEFGH, &XABCDEFG, &IXABCDEF); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, XABCDEFG); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i JKLMNOP0 = _mm_srli_si128(IJKLMNOP, 2); + __m128i avg3_left = avg3_epu16(&XIJKLMNO, &IJKLMNOP, &JKLMNOP0); + __m128i rowa = avg2; + __m128i rowb = avg3; + int i; + (void)bd; + for (i = 0; i < 8; i += 2) { + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb); + dst += stride; + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + rowb = _mm_alignr_epi8(rowb, rotr_epu16(&avg3_left, &rotrw), 14); + } +} + +void vpx_highbd_d117_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_srli_si128(L1, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + dst += stride; + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d117_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i B0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i B3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = 
_mm_avg_epu16(A3, B3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B0, _mm_slli_si128(L0, 14), 14); + const __m128i C1 = _mm_alignr_epi8(B1, B0, 14); + const __m128i C2 = _mm_alignr_epi8(B2, B1, 14); + const __m128i C3 = _mm_alignr_epi8(B3, B2, 14); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(B0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(L1, L0, 2); + const __m128i L1_ = _mm_alignr_epi8(L2, L1, 2); + const __m128i L2_ = _mm_alignr_epi8(L3, L2, 2); + const __m128i L3_ = _mm_srli_si128(L3, 2); + __m128i rowa_0 = avg2_0; + __m128i rowa_1 = avg2_1; + __m128i rowa_2 = avg2_2; + __m128i rowa_3 = avg2_3; + __m128i rowb_0 = avg3_0; + __m128i rowb_1 = avg3_1; + __m128i rowb_2 = avg3_2; + __m128i rowb_3 = avg3_3; + __m128i avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&XL0, &L0, &L0_); + avg3_left[1] = avg3_epu16(&XL1, &L1, &L1_); + avg3_left[2] = avg3_epu16(&XL2, &L2, &L2_); + avg3_left[3] = avg3_epu16(&XL3, &L3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; j += 2) { + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + _mm_store_si128((__m128i *)dst, rowb_0); + _mm_store_si128((__m128i *)(dst + 8), rowb_1); + _mm_store_si128((__m128i *)(dst + 16), rowb_2); + _mm_store_si128((__m128i *)(dst + 24), rowb_3); + dst += stride; + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + rowb_3 = _mm_alignr_epi8(rowb_3, rowb_2, 14); + rowb_2 = _mm_alignr_epi8(rowb_2, rowb_1, 14); + rowb_1 = _mm_alignr_epi8(rowb_1, rowb_0, 14); + rowb_0 = _mm_alignr_epi8(rowb_0, rotr_epu16(&avg_left, &rotrw), 14); + } + } +} + +void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); + const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); + __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + __m128i rowa = avg3; + int i; + (void)bd; + for (i = 0; i < 8; ++i) { + rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa); + dst += stride; + } +} + +void 
vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_srli_si128(B1, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i avg3_left[2]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + for (i = 0; i < 2; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + dst += stride; + } + } +} + +void vpx_highbd_d135_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_load_si128((const __m128i *)above); + const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i B2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i B3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); + const __m128i C1 = _mm_alignr_epi8(B2, B1, 2); + const __m128i C2 = _mm_alignr_epi8(B3, B2, 2); + const __m128i C3 = _mm_srli_si128(B3, 2); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); + const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); + const __m128i L2_ = _mm_alignr_epi8(XL2, XL1, 14); + const __m128i L3_ = _mm_alignr_epi8(XL3, XL2, 14); + __m128i rowa_0 = avg3_0; + __m128i rowa_1 = avg3_1; + __m128i rowa_2 = avg3_2; + __m128i rowa_3 = avg3_3; + __m128i 
avg3_left[4]; + int i, j; + (void)bd; + avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); + avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); + avg3_left[2] = avg3_epu16(&L2, &XL2, &L2_); + avg3_left[3] = avg3_epu16(&L3, &XL3, &L3_); + for (i = 0; i < 4; ++i) { + __m128i avg_left = avg3_left[i]; + for (j = 0; j < 8; ++j) { + rowa_3 = _mm_alignr_epi8(rowa_3, rowa_2, 14); + rowa_2 = _mm_alignr_epi8(rowa_2, rowa_1, 14); + rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); + rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); + _mm_store_si128((__m128i *)dst, rowa_0); + _mm_store_si128((__m128i *)(dst + 8), rowa_1); + _mm_store_si128((__m128i *)(dst + 16), rowa_2); + _mm_store_si128((__m128i *)(dst + 24), rowa_3); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i ABCDEFG0 = _mm_srli_si128(XABCDEFG, 2); + const __m128i BCDEFG00 = _mm_srli_si128(XABCDEFG, 4); + const __m128i avg3 = avg3_epu16(&BCDEFG00, &ABCDEFG0, &XABCDEFG); + const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); + const __m128i XIJKLMNO = + _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); + const __m128i AXIJKLMN = + _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(XABCDEFG, 12), 14); + const __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); + const __m128i avg2_left = _mm_avg_epu16(IJKLMNOP, XIJKLMNO); + const __m128i avg2_avg3_lo = _mm_unpacklo_epi16(avg2_left, avg3_left); + const __m128i avg2_avg3_hi = _mm_unpackhi_epi16(avg2_left, avg3_left); + const __m128i row0 = + _mm_alignr_epi8(avg3, _mm_slli_si128(avg2_avg3_lo, 12), 12); + const __m128i row1 = + _mm_alignr_epi8(row0, _mm_slli_si128(avg2_avg3_lo, 8), 12); + const __m128i row2 = + _mm_alignr_epi8(row1, _mm_slli_si128(avg2_avg3_lo, 4), 12); + const __m128i row3 = _mm_alignr_epi8(row2, avg2_avg3_lo, 12); + const __m128i row4 = + _mm_alignr_epi8(row3, _mm_slli_si128(avg2_avg3_hi, 12), 12); + const __m128i row5 = + _mm_alignr_epi8(row4, _mm_slli_si128(avg2_avg3_hi, 8), 12); + const __m128i row6 = + _mm_alignr_epi8(row5, _mm_slli_si128(avg2_avg3_hi, 4), 12); + const __m128i row7 = _mm_alignr_epi8(row6, avg2_avg3_hi, 12); + (void)bd; + _mm_store_si128((__m128i *)dst, row0); + dst += stride; + _mm_store_si128((__m128i *)dst, row1); + dst += stride; + _mm_store_si128((__m128i *)dst, row2); + dst += stride; + _mm_store_si128((__m128i *)dst, row3); + dst += stride; + _mm_store_si128((__m128i *)dst, row4); + dst += stride; + _mm_store_si128((__m128i *)dst, row5); + dst += stride; + _mm_store_si128((__m128i *)dst, row6); + dst += stride; + _mm_store_si128((__m128i *)dst, row7); +} + +void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_srli_si128(A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_srli_si128(A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i AXL0 = 
_mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i avg2_avg3_left[2][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + dst += stride; + } + } +} + +void vpx_highbd_d153_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); + const __m128i A2 = _mm_loadu_si128((const __m128i *)(above + 15)); + const __m128i A3 = _mm_loadu_si128((const __m128i *)(above + 23)); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_srli_si128(A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_srli_si128(A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i L0 = _mm_load_si128((const __m128i *)left); + const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i L2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i L3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); + const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); + const __m128i XL2 = _mm_alignr_epi8(L2, L1, 14); + const __m128i XL3 = _mm_alignr_epi8(L3, L2, 14); + const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14); + const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12); + const __m128i AXL2 = _mm_alignr_epi8(L2, L1, 12); + const __m128i AXL3 = _mm_alignr_epi8(L3, L2, 12); + const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0); + const __m128i 
avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1); + const __m128i avg3_left_2 = avg3_epu16(&L2, &XL2, &AXL2); + const __m128i avg3_left_3 = avg3_epu16(&L3, &XL3, &AXL3); + const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0); + const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1); + const __m128i avg2_left_2 = _mm_avg_epu16(L2, XL2); + const __m128i avg2_left_3 = _mm_avg_epu16(L3, XL3); + __m128i row_0 = avg3_0; + __m128i row_1 = avg3_1; + __m128i row_2 = avg3_2; + __m128i row_3 = avg3_3; + __m128i avg2_avg3_left[4][2]; + int i, j; + (void)bd; + + avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0); + avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1); + avg2_avg3_left[2][0] = _mm_unpacklo_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[2][1] = _mm_unpackhi_epi16(avg2_left_2, avg3_left_2); + avg2_avg3_left[3][0] = _mm_unpacklo_epi16(avg2_left_3, avg3_left_3); + avg2_avg3_left[3][1] = _mm_unpackhi_epi16(avg2_left_3, avg3_left_3); + + for (j = 0; j < 4; ++j) { + for (i = 0; i < 2; ++i) { + const __m128i avg2_avg3 = avg2_avg3_left[j][i]; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + row_3 = _mm_alignr_epi8(row_3, row_2, 12); + row_2 = _mm_alignr_epi8(row_2, row_1, 12); + row_1 = _mm_alignr_epi8(row_1, row_0, 12); + row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12); + _mm_store_si128((__m128i *)dst, row_0); + _mm_store_si128((__m128i *)(dst + 8), row_1); + _mm_store_si128((__m128i *)(dst + 16), row_2); + _mm_store_si128((__m128i *)(dst + 24), row_3); + dst += stride; + } + } +} + +static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i 
HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3); + const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3); + (void)above; + (void)bd; + d207_store_4x8(&dst, stride, &out_a, &out_b); + d207_store_4x8(&dst, stride, &out_b, &HHHHHHHH); +} + +static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(LR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(LR, A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + (void)above; + (void)bd; + d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c); + d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d); + d207_store_4x16(&dst, stride, &out_c, &out_d, &LR); + d207_store_4x16(&dst, stride, &out_d, &LR, &LR); +} + +static INLINE void d207_store_4x32(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c, const __m128i *d, + const __m128i *e) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + _mm_store_si128((__m128i *)(*dst + 16), *c); + _mm_store_si128((__m128i *)(*dst + 24), *d); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + 
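// Last of the four rows: each row shifts the interleaved avg2/avg3 stream
// left by two more pixels (alignr by 4, 8, then 12 bytes), so the caller
// resumes one full vector further along the stream.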
_mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(LR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(LR, A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2); + const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2); + const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3); + const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3); + (void)above; + (void)bd; + d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e); + d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f); + d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g); + d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h); + d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR); + d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR); + d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR); + d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR); +} + +static INLINE void d63_store_4x8(uint16_t **dst, const ptrdiff_t stride, + __m128i *a, __m128i *b, const __m128i *ar) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, *b); + *dst += stride; + *a = _mm_alignr_epi8(*ar, *a, 2); + *b = _mm_alignr_epi8(*ar, *b, 2); +} + +void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, 
ABCDEFGH, 4); + __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + (void)left; + (void)bd; + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); + d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH); +} + +void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(AR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(AR, A1, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + int i; + (void)left; + (void)bd; + for (i = 0; i < 14; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); +} + +void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)above); + const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24)); + const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i AR = _mm_unpackhi_epi64(AR0, AR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(AR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(AR, A3, 4); + __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + __m128i avg2_0 = _mm_avg_epu16(A0, B0); + __m128i avg2_1 = _mm_avg_epu16(A1, B1); + __m128i avg2_2 = _mm_avg_epu16(A2, B2); + __m128i avg2_3 = _mm_avg_epu16(A3, B3); + int i; + (void)left; + (void)bd; + for (i = 0; i < 30; i += 2) { + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); + dst += stride; + avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2); + avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2); + avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 
2); + avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2); + avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2); + avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2); + avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2); + avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2); + } + _mm_store_si128((__m128i *)dst, avg2_0); + _mm_store_si128((__m128i *)(dst + 8), avg2_1); + _mm_store_si128((__m128i *)(dst + 16), avg2_2); + _mm_store_si128((__m128i *)(dst + 24), avg2_3); + dst += stride; + _mm_store_si128((__m128i *)dst, avg3_0); + _mm_store_si128((__m128i *)(dst + 8), avg3_1); + _mm_store_si128((__m128i *)(dst + 16), avg3_2); + _mm_store_si128((__m128i *)(dst + 24), avg3_3); +} diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h index ea100c6e1..e0f749552 100644 --- a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -12,59 +12,389 @@ #define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -static INLINE __m128i add_dc_clamp(const __m128i *const min, - const __m128i *const max, - const __m128i *const dc, - const __m128i *const in) { - __m128i out; - out = _mm_adds_epi16(*in, *dc); - out = _mm_max_epi16(out, *min); - out = _mm_min_epi16(out, *max); - return out; +static INLINE void extend_64bit(const __m128i in, + __m128i *const out /*out[2]*/) { + out[0] = _mm_unpacklo_epi32(in, in); // 0, 0, 1, 1 + out[1] = _mm_unpackhi_epi32(in, in); // 2, 2, 3, 3 } -static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, - uint16_t *dest, int stride, int bd, - const int size) { - const __m128i zero = _mm_setzero_si128(); +static INLINE __m128i wraplow_16bit_shift4(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 4); + temp[1] = _mm_srai_epi32(temp[1], 4); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i wraplow_16bit_shift5(const __m128i in0, const __m128i in1, + const __m128i rounding) { + __m128i temp[2]; + temp[0] = _mm_add_epi32(in0, rounding); + temp[1] = _mm_add_epi32(in1, rounding); + temp[0] = _mm_srai_epi32(temp[0], 5); + temp[1] = _mm_srai_epi32(temp[1], 5); + return _mm_packs_epi32(temp[0], temp[1]); +} + +static INLINE __m128i dct_const_round_shift_64bit(const __m128i in) { + const __m128i t = + _mm_add_epi64(in, pair_set_epi32(DCT_CONST_ROUNDING << 2, 0)); + return _mm_srli_si128(t, 2); +} + +static INLINE __m128i pack_4(const __m128i in0, const __m128i in1) { + const __m128i t0 = _mm_unpacklo_epi32(in0, in1); // 0, 2 + const __m128i t1 = _mm_unpackhi_epi32(in0, in1); // 1, 3 + return _mm_unpacklo_epi32(t0, t1); // 0, 1, 2, 3 +} + +static INLINE void abs_extend_64bit_sse2(const __m128i in, + __m128i *const out /*out[2]*/, + __m128i *const sign /*sign[2]*/) { + sign[0] = _mm_srai_epi32(in, 31); + out[0] = _mm_xor_si128(in, sign[0]); + out[0] = _mm_sub_epi32(out[0], sign[0]); + sign[1] = _mm_unpackhi_epi32(sign[0], sign[0]); // 64-bit sign of 2, 3 + sign[0] = _mm_unpacklo_epi32(sign[0], sign[0]); // 64-bit sign of 0, 1 + out[1] = _mm_unpackhi_epi32(out[0], out[0]); // 2, 3 + out[0] = _mm_unpacklo_epi32(out[0], out[0]); // 0, 1 +} + +// Note: cospi must be non negative. 
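+// SSE2 has no signed 32x32->64 bit multiply (_mm_mul_epi32 is SSE4.1), only
+// the unsigned _mm_mul_epu32, so the helpers below multiply the absolute
+// values prepared by abs_extend_64bit_sse2() and restore the sign afterwards.
+// `sign` is 0 or -1 (all ones) in each 64-bit lane; with two's complement,
+// (product ^ sign) - sign yields product when sign == 0 and -product when
+// sign == -1, i.e. a branchless conditional negate of each 64-bit product.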
+static INLINE __m128i multiply_apply_sign_sse2(const __m128i in, + const __m128i sign, + const __m128i cospi) { + __m128i out = _mm_mul_epu32(in, cospi); + out = _mm_xor_si128(out, sign); + return _mm_sub_epi64(out, sign); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c must be non negative. +static INLINE __m128i multiplication_neg_round_shift_sse2( + const __m128i *const in /*in[2]*/, const __m128i *const sign /*sign[2]*/, + const int c) { + const __m128i pair_c = pair_set_epi32(c << 2, 0); + __m128i t0, t1; + + assert(c >= 0); + t0 = multiply_apply_sign_sse2(in[0], sign[0], pair_c); + t1 = multiply_apply_sign_sse2(in[1], sign[1], pair_c); + t0 = _mm_sub_epi64(_mm_setzero_si128(), t0); + t1 = _mm_sub_epi64(_mm_setzero_si128(), t1); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_butterfly_sse2(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(c0 << 2, 0); + const __m128i pair_c1 = pair_set_epi32(c1 << 2, 0); + __m128i temp1[4], temp2[4], sign1[2], sign2[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in0, temp1, sign1); + abs_extend_64bit_sse2(in1, temp2, sign2); + temp1[2] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c1); + temp1[3] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c1); + temp1[0] = multiply_apply_sign_sse2(temp1[0], sign1[0], pair_c0); + temp1[1] = multiply_apply_sign_sse2(temp1[1], sign1[1], pair_c0); + temp2[2] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c0); + temp2[3] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c0); + temp2[0] = multiply_apply_sign_sse2(temp2[0], sign2[0], pair_c1); + temp2[1] = multiply_apply_sign_sse2(temp2[1], sign2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +// Note: c0 and c1 must be non negative. +static INLINE void highbd_partial_butterfly_sse2(const __m128i in, const int c0, + const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_round_shift_sse2(temp, sign, c0); + *out1 = multiplication_round_shift_sse2(temp, sign, c1); +} + +// Note: c0 and c1 must be non negative. 
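+// Same as highbd_partial_butterfly_sse2(), except that the first output is
+// negated before the rounding shift: out0 = round_shift(in * -c1) and
+// out1 = round_shift(in * c0). This lets callers apply a negative cosine
+// constant while still passing non negative c0 and c1.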
+static INLINE void highbd_partial_butterfly_neg_sse2(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2], sign[2]; + + assert(c0 >= 0); + assert(c1 >= 0); + abs_extend_64bit_sse2(in, temp, sign); + *out0 = multiplication_neg_round_shift_sse2(temp, sign, c1); + *out1 = multiplication_round_shift_sse2(temp, sign, c0); +} + +static INLINE void highbd_butterfly_cospi16_sse2(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2, sign[2]; + + temp2 = _mm_add_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out0 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + abs_extend_64bit_sse2(temp2, temp1, sign); + *out1 = multiplication_round_shift_sse2(temp1, sign, cospi_16_64); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void highbd_add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi32(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi32(in[i], in[bound - i]); + i++; + } +} + +static INLINE void highbd_idct8_stage4(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[7]); + out[1] = _mm_add_epi32(in[1], in[6]); + out[2] = _mm_add_epi32(in[2], in[5]); + out[3] = _mm_add_epi32(in[3], in[4]); + out[4] = _mm_sub_epi32(in[3], in[4]); + out[5] = _mm_sub_epi32(in[2], in[5]); + out[6] = _mm_sub_epi32(in[1], in[6]); + out[7] = _mm_sub_epi32(in[0], in[7]); +} + +static INLINE void highbd_idct8x8_final_round(__m128i *const io) { + io[0] = wraplow_16bit_shift5(io[0], io[8], _mm_set1_epi32(16)); + io[1] = wraplow_16bit_shift5(io[1], io[9], _mm_set1_epi32(16)); + io[2] = wraplow_16bit_shift5(io[2], io[10], _mm_set1_epi32(16)); + io[3] = wraplow_16bit_shift5(io[3], io[11], _mm_set1_epi32(16)); + io[4] = wraplow_16bit_shift5(io[4], io[12], _mm_set1_epi32(16)); + io[5] = wraplow_16bit_shift5(io[5], io[13], _mm_set1_epi32(16)); + io[6] = wraplow_16bit_shift5(io[6], io[14], _mm_set1_epi32(16)); + io[7] = wraplow_16bit_shift5(io[7], io[15], _mm_set1_epi32(16)); +} + +static INLINE void highbd_idct16_4col_stage7(const __m128i *const in, + __m128i *const out) { + out[0] = _mm_add_epi32(in[0], in[15]); + out[1] = _mm_add_epi32(in[1], in[14]); + out[2] = _mm_add_epi32(in[2], in[13]); + out[3] = _mm_add_epi32(in[3], in[12]); + out[4] = _mm_add_epi32(in[4], in[11]); + out[5] = _mm_add_epi32(in[5], in[10]); + out[6] = _mm_add_epi32(in[6], in[9]); + out[7] = _mm_add_epi32(in[7], in[8]); + out[8] = _mm_sub_epi32(in[7], in[8]); + out[9] = _mm_sub_epi32(in[6], in[9]); + out[10] = _mm_sub_epi32(in[5], in[10]); + out[11] = _mm_sub_epi32(in[4], in[11]); + out[12] = _mm_sub_epi32(in[3], in[12]); + out[13] = _mm_sub_epi32(in[2], in[13]); + out[14] = _mm_sub_epi32(in[1], in[14]); + out[15] = _mm_sub_epi32(in[0], in[15]); +} + +static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1, + const int bd) { + const __m128i zero = _mm_set1_epi16(0); // Faster than _mm_set1_epi16((1 << bd) - 1). 
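  // (The "Faster than" remark above belongs to the max bound built on the
  // next two lines: (1 << bd) - 1 is formed with a shift and subtract,
  // presumably because bd is a runtime value and a variable _mm_set1_epi16
  // would go through a scalar insert plus shuffle.) In scalar terms
  // add_clamp() is simply
  //
  //   d = clamp(in0 + in1, 0, (1 << bd) - 1);  /* saturate to bd-bit range */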
const __m128i one = _mm_set1_epi16(1); const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + __m128i d; + + d = _mm_adds_epi16(in0, in1); + d = _mm_max_epi16(d, zero); + d = _mm_min_epi16(d, max); + + return d; +} + +static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, + uint16_t *dest, int stride, int bd, + const int size) { int a1, i, j; tran_low_t out; __m128i dc, d; - out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + out = HIGHBD_WRAPLOW( + dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); + out = + HIGHBD_WRAPLOW(dct_const_round_shift(out * (tran_high_t)cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6); dc = _mm_set1_epi16(a1); for (i = 0; i < size; ++i) { - for (j = 0; j < (size >> 3); ++j) { - d = _mm_load_si128((const __m128i *)(&dest[j * 8])); - d = add_dc_clamp(&zero, &max, &dc, &d); - _mm_store_si128((__m128i *)(&dest[j * 8]), d); + for (j = 0; j < size; j += 8) { + d = _mm_load_si128((const __m128i *)(&dest[j])); + d = add_clamp(d, dc, bd); + _mm_store_si128((__m128i *)(&dest[j]), d); } dest += stride; } } -static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { - __m128i ubounded, retval; - const __m128i zero = _mm_set1_epi16(0); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - ubounded = _mm_cmpgt_epi16(value, max); - retval = _mm_andnot_si128(ubounded, value); - ubounded = _mm_and_si128(ubounded, max); - retval = _mm_or_si128(retval, ubounded); - retval = _mm_and_si128(retval, _mm_cmpgt_epi16(retval, zero)); - return retval; +static INLINE void recon_and_store_4(const __m128i in, uint16_t *const dest, + const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)dest, d); +} + +static INLINE void recon_and_store_4x2(const __m128i in, uint16_t *const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride)); + d = _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(d), (const __m64 *)(dest + 1 * stride))); + d = add_clamp(d, in, bd); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d)); +} + +static INLINE void recon_and_store_4x4(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_4x2(in[0], dest, stride, bd); + dest += 2 * stride; + recon_and_store_4x2(in[1], dest, stride, bd); +} + +static INLINE void recon_and_store_8(const __m128i in, uint16_t **const dest, + const int stride, const int bd) { + __m128i d; + + d = _mm_load_si128((const __m128i *)(*dest)); + d = add_clamp(d, in, bd); + _mm_store_si128((__m128i *)(*dest), d); + *dest += stride; +} + +static INLINE void recon_and_store_8x8(const __m128i *const in, uint16_t *dest, + const int stride, const int bd) { + recon_and_store_8(in[0], &dest, stride, bd); + recon_and_store_8(in[1], &dest, stride, bd); + recon_and_store_8(in[2], &dest, stride, bd); + recon_and_store_8(in[3], &dest, stride, bd); + recon_and_store_8(in[4], &dest, stride, bd); + recon_and_store_8(in[5], &dest, stride, bd); + recon_and_store_8(in[6], &dest, stride, bd); + recon_and_store_8(in[7], &dest, stride, bd); +} + +static INLINE __m128i load_pack_8_32bit(const tran_low_t *const input) { + const __m128i t0 = _mm_load_si128((const __m128i *)(input + 0)); + const __m128i t1 = 
_mm_load_si128((const __m128i *)(input + 4)); + return _mm_packs_epi32(t0, t1); +} + +static INLINE void highbd_load_pack_transpose_32bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_pack_8_32bit(input + 0 * stride); + in[1] = load_pack_8_32bit(input + 1 * stride); + in[2] = load_pack_8_32bit(input + 2 * stride); + in[3] = load_pack_8_32bit(input + 3 * stride); + in[4] = load_pack_8_32bit(input + 4 * stride); + in[5] = load_pack_8_32bit(input + 5 * stride); + in[6] = load_pack_8_32bit(input + 6 * stride); + in[7] = load_pack_8_32bit(input + 7 * stride); + transpose_16bit_8x8(in, in); +} + +static INLINE void highbd_load_transpose_32bit_8x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride + 0)); + in[1] = _mm_load_si128((const __m128i *)(input + 0 * stride + 4)); + in[2] = _mm_load_si128((const __m128i *)(input + 1 * stride + 0)); + in[3] = _mm_load_si128((const __m128i *)(input + 1 * stride + 4)); + in[4] = _mm_load_si128((const __m128i *)(input + 2 * stride + 0)); + in[5] = _mm_load_si128((const __m128i *)(input + 2 * stride + 4)); + in[6] = _mm_load_si128((const __m128i *)(input + 3 * stride + 0)); + in[7] = _mm_load_si128((const __m128i *)(input + 3 * stride + 4)); + transpose_32bit_8x4(in, in); +} + +static INLINE void highbd_load_transpose_32bit_4x4(const tran_low_t *input, + const int stride, + __m128i *in) { + in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); + in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); + in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); + in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); + transpose_32bit_4x4(in, in); +} + +static INLINE void highbd_write_buffer_8(uint16_t *dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store_8(out, &dest, 0, bd); +} + +static INLINE void highbd_write_buffer_4(uint16_t *const dest, const __m128i in, + const int bd) { + const __m128i final_rounding = _mm_set1_epi32(1 << 5); + __m128i out; + + out = _mm_add_epi32(in, final_rounding); + out = _mm_srai_epi32(out, 6); + out = _mm_packs_epi32(out, out); + recon_and_store_4(out, dest, bd); } #endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE2_H_ diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h new file mode 100644 index 000000000..9c8eef40f --- /dev/null +++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse4.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ +#define VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ + +#include <smmintrin.h> // SSE4.1 + +#include "./vpx_config.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" + +static INLINE __m128i multiplication_round_shift_sse4_1( + const __m128i *const in /*in[2]*/, const int c) { + const __m128i pair_c = pair_set_epi32(c * 4, 0); + __m128i t0, t1; + + t0 = _mm_mul_epi32(in[0], pair_c); + t1 = _mm_mul_epi32(in[1], pair_c); + t0 = dct_const_round_shift_64bit(t0); + t1 = dct_const_round_shift_64bit(t1); + + return pack_4(t0, t1); +} + +static INLINE void highbd_butterfly_sse4_1(const __m128i in0, const __m128i in1, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + const __m128i pair_c0 = pair_set_epi32(4 * c0, 0); + const __m128i pair_c1 = pair_set_epi32(4 * c1, 0); + __m128i temp1[4], temp2[4]; + + extend_64bit(in0, temp1); + extend_64bit(in1, temp2); + temp1[2] = _mm_mul_epi32(temp1[0], pair_c1); + temp1[3] = _mm_mul_epi32(temp1[1], pair_c1); + temp1[0] = _mm_mul_epi32(temp1[0], pair_c0); + temp1[1] = _mm_mul_epi32(temp1[1], pair_c0); + temp2[2] = _mm_mul_epi32(temp2[0], pair_c0); + temp2[3] = _mm_mul_epi32(temp2[1], pair_c0); + temp2[0] = _mm_mul_epi32(temp2[0], pair_c1); + temp2[1] = _mm_mul_epi32(temp2[1], pair_c1); + temp1[0] = _mm_sub_epi64(temp1[0], temp2[0]); + temp1[1] = _mm_sub_epi64(temp1[1], temp2[1]); + temp2[0] = _mm_add_epi64(temp1[2], temp2[2]); + temp2[1] = _mm_add_epi64(temp1[3], temp2[3]); + temp1[0] = dct_const_round_shift_64bit(temp1[0]); + temp1[1] = dct_const_round_shift_64bit(temp1[1]); + temp2[0] = dct_const_round_shift_64bit(temp2[0]); + temp2[1] = dct_const_round_shift_64bit(temp2[1]); + *out0 = pack_4(temp1[0], temp1[1]); + *out1 = pack_4(temp2[0], temp2[1]); +} + +static INLINE void highbd_butterfly_cospi16_sse4_1(const __m128i in0, + const __m128i in1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp1[2], temp2; + + temp2 = _mm_add_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out0 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); + temp2 = _mm_sub_epi32(in0, in1); + extend_64bit(temp2, temp1); + *out1 = multiplication_round_shift_sse4_1(temp1, cospi_16_64); +} + +static INLINE void highbd_partial_butterfly_sse4_1(const __m128i in, + const int c0, const int c1, + __m128i *const out0, + __m128i *const out1) { + __m128i temp[2]; + + extend_64bit(in, temp); + *out0 = multiplication_round_shift_sse4_1(temp, c0); + *out1 = multiplication_round_shift_sse4_1(temp, c1); +} + +#endif // VPX_DSP_X86_HIGHBD_INV_TXFM_SSE4_H_ diff --git a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c index 8670b2895..ec22db9f4 100644 --- a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c @@ -12,7 +12,6 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) { __m128i ubounded; diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 2362476c1..cedf98aff 100644 --- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> #include <emmintrin.h> #include "vpx_dsp/vpx_dsp_common.h" @@ -37,54 +38,54 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); (void)scan; + (void)skip_block; + assert(!skip_block); memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = ((int)count / 4) - 1; i >= 0; i--) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (test == 0xffff) - non_zero_regs--; - else - break; - } + // Pre-scan pass + for (i = ((int)count / 4) - 1; i >= 0; i--) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (test == 0xffff) + non_zero_regs--; + else + break; + } - // Quantization pass: - for (i = 0; i < non_zero_regs; i++) { - __m128i coeffs, coeffs_sign, tmp1, tmp2; - int test; - int abs_coeff[4]; - int coeff_sign[4]; - - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - coeffs_sign = _mm_srai_epi32(coeffs, 31); - coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); - tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); - tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); - tmp1 = _mm_or_si128(tmp1, tmp2); - test = _mm_movemask_epi8(tmp1); - _mm_storeu_si128((__m128i *)abs_coeff, coeffs); - _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); - - for (j = 0; j < 4; j++) { - if (test & (1 << (4 * j))) { - int k = 4 * i + j; - const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; - const uint32_t abs_qcoeff = - (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); - qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; - dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; - if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; - } + // Quantization pass: + for (i = 0; i < non_zero_regs; i++) { + __m128i coeffs, coeffs_sign, tmp1, tmp2; + int test; + int abs_coeff[4]; + int coeff_sign[4]; + + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + coeffs_sign = _mm_srai_epi32(coeffs, 31); + coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign); + tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]); + tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]); + tmp1 = _mm_or_si128(tmp1, tmp2); + test = _mm_movemask_epi8(tmp1); + _mm_storeu_si128((__m128i *)abs_coeff, coeffs); + _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign); + + for (j = 0; j < 4; j++) { + if (test & (1 << (4 * j))) { + int k = 4 * i + j; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; + const uint32_t abs_qcoeff = + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); + qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; + dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; + if (abs_qcoeff) eob_i = iscan[k] > eob_i ? 
iscan[k] : eob_i; } } } @@ -105,6 +106,9 @@ void vpx_highbd_quantize_b_32x32_sse2( const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); (void)scan; + (void)skip_block; + assert(!skip_block); + zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); @@ -116,38 +120,35 @@ void vpx_highbd_quantize_b_32x32_sse2( memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); - if (!skip_block) { - // Pre-scan pass - for (i = 0; i < n_coeffs / 4; i++) { - __m128i coeffs, cmp1, cmp2; - int test; - coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); - cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); - cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); - cmp1 = _mm_and_si128(cmp1, cmp2); - test = _mm_movemask_epi8(cmp1); - if (!(test & 0xf)) idx_arr[idx++] = i * 4; - if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; - if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; - if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; - } + // Pre-scan pass + for (i = 0; i < n_coeffs / 4; i++) { + __m128i coeffs, cmp1, cmp2; + int test; + coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4)); + cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]); + cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]); + cmp1 = _mm_and_si128(cmp1, cmp2); + test = _mm_movemask_epi8(cmp1); + if (!(test & 0xf)) idx_arr[idx++] = i * 4; + if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1; + if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2; + if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3; + } - // Quantization pass: only process the coefficients selected in - // pre-scan pass. Note: idx can be zero. - for (i = 0; i < idx; i++) { - const int rc = idx_arr[i]; - const int coeff = coeff_ptr[rc]; - const int coeff_sign = (coeff >> 31); - const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = - abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; - const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); - qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; - dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; - if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob; - } + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = idx_arr[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const uint32_t abs_qcoeff = + (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } *eob_ptr = eob + 1; } diff --git a/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm index 30ee81b68..d9a6932e0 100644 --- a/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm +++ b/libvpx/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm @@ -72,7 +72,7 @@ SECTION .text paddd m6, m4 mov r1, ssem ; r1 = unsigned int *sse movd [r1], m7 ; store sse - movd rax, m6 ; store sum as return value + movd eax, m6 ; store sum as return value %endif RET %endmacro diff --git a/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm b/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm index 923418a99..e646767e1 100644 --- a/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm +++ b/libvpx/vpx_dsp/x86/highbd_variance_impl_sse2.asm @@ -11,6 +11,8 @@ %include "vpx_ports/x86_abi_support.asm" +SECTION .text + ;unsigned int vpx_highbd_calc16x16var_sse2 ;( ; unsigned char * src_ptr, diff --git a/libvpx/vpx_dsp/x86/intrapred_sse2.asm b/libvpx/vpx_dsp/x86/intrapred_sse2.asm index c18095c28..61af6236e 100644 --- a/libvpx/vpx_dsp/x86/intrapred_sse2.asm +++ b/libvpx/vpx_dsp/x86/intrapred_sse2.asm @@ -61,7 +61,7 @@ cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset psrlq m3, 8 movd [dstq+strideq ], m3 psrlq m0, 56 - movd tempq, m0 + movd tempd, m0 mov [dstq+strideq+3], tempb RESTORE_GOT diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index f75dab07a..f6e56b6f9 100644 --- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -8,19 +8,29 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <emmintrin.h> // SSE2 + #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE void transpose_16bit_4(__m128i *res) { + const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); + const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); + + res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); + res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +} + void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { const __m128i eight = _mm_set1_epi16(8); __m128i in[2]; // Rows - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8); + in[0] = load_input_data8(input); + in[1] = load_input_data8(input + 8); idct4_sse2(in); // Columns @@ -41,7 +51,7 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int a; __m128i dc_value, d[2]; - a = (int)dct_const_round_shift(input[0] * cospi_16_64); + a = (int)dct_const_round_shift((int16_t)input[0] * cospi_16_64); a = (int)dct_const_round_shift(a * cospi_16_64); a = ROUND_POWER_OF_TWO(a, 4); @@ -69,35 +79,19 @@ void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); } -void idct4_sse2(__m128i *in) { +void idct4_sse2(__m128i *const in) { const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; + __m128i u[2]; - transpose_16bit_4x4(in); + transpose_16bit_4(in); // stage 1 u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpackhi_epi16(in[0], in[1]); - v[0] = 
_mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[3], v[2]); + u[0] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_p16_m16, u[0]); + u[1] = idct_calc_wraplow_sse2(k__cospi_p08_p24, k__cospi_p24_m08, u[1]); // stage 2 in[0] = _mm_add_epi16(u[0], u[1]); @@ -105,7 +99,7 @@ void idct4_sse2(__m128i *in) { in[1] = _mm_shuffle_epi32(in[1], 0x4E); } -void iadst4_sse2(__m128i *in) { +void iadst4_sse2(__m128i *const in) { const __m128i k__sinpi_p01_p04 = pair_set_epi16(sinpi_1_9, sinpi_4_9); const __m128i k__sinpi_p03_p02 = pair_set_epi16(sinpi_3_9, sinpi_2_9); const __m128i k__sinpi_p02_m01 = pair_set_epi16(sinpi_2_9, -sinpi_1_9); @@ -115,7 +109,7 @@ void iadst4_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8], in7; - transpose_16bit_4x4(in); + transpose_16bit_4(in); in7 = _mm_srli_si128(in[1], 8); in7 = _mm_add_epi16(in7, in[0]); in7 = _mm_sub_epi16(in7, in[1]); @@ -154,215 +148,93 @@ void iadst4_sse2(__m128i *in) { in[1] = _mm_packs_epi32(u[2], u[3]); } -#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - } - -#define IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, \ - out4, out5, out6, out7) \ - { \ - /* Stage1 */ \ - { \ - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \ - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \ - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \ - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \ - \ - MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, stg1_1, \ - stg1_2, stg1_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \ - const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \ - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \ - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \ - \ - MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - 
stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ - tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ - tmp2 = _mm_madd_epi16(lo_56, stg2_0); \ - tmp3 = _mm_madd_epi16(hi_56, stg2_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - } \ - \ - /* Stage4 */ \ - out0 = _mm_add_epi16(stp1_0, stp2_7); \ - out1 = _mm_add_epi16(stp1_1, stp1_6); \ - out2 = _mm_add_epi16(stp1_2, stp1_5); \ - out3 = _mm_add_epi16(stp1_3, stp2_4); \ - out4 = _mm_sub_epi16(stp1_3, stp2_4); \ - out5 = _mm_sub_epi16(stp1_2, stp1_5); \ - out6 = _mm_sub_epi16(stp1_1, stp1_6); \ - out7 = _mm_sub_epi16(stp1_0, stp2_7); \ - } +static INLINE void load_buffer_8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 8); + in[1] = load_input_data8(input + 1 * 8); + in[2] = load_input_data8(input + 2 * 8); + in[3] = load_input_data8(input + 3 * 8); + in[4] = load_input_data8(input + 4 * 8); + in[5] = load_input_data8(input + 5 * 8); + in[6] = load_input_data8(input + 6 * 8); + in[7] = load_input_data8(input + 7 * 8); +} void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + __m128i in[8]; int i; // Load input data. 
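// The rewrite folds the old TRANSPOSE_8X8 + IDCT8 macro pair into
// idct8_sse2(), so the 2-D transform becomes the same 1-D routine run
// twice, once over rows and once over columns. The new body reduces to
// roughly:
//
//   load_buffer_8x8(input, in);          // 8 rows of 8 int16 coefficients
//   idct8_sse2(in);                      // pass 1: transpose + 1-D idct8
//   idct8_sse2(in);                      // pass 2: transpose + 1-D idct8
//   write_buffer_8x8(in, dest, stride);  // round (>> 5), add, clamp, store
//
// The removed lines below are the open-coded loads, butterflies, final
// rounding and RECON_AND_STORE calls that this replaces.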
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); + load_buffer_8x8(input, in); // 2-D for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, - in6, in7); + idct8_sse2(in); } - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); + write_buffer_8x8(in, dest, stride); } -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 5); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[8]; - dc_value = _mm_set1_epi16(a); + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); - RECON_AND_STORE(dest + 0 * stride, dc_value); - RECON_AND_STORE(dest + 1 * stride, dc_value); - RECON_AND_STORE(dest + 2 * stride, dc_value); - RECON_AND_STORE(dest + 3 * stride, dc_value); - RECON_AND_STORE(dest + 4 * stride, dc_value); - RECON_AND_STORE(dest + 5 * stride, dc_value); - RECON_AND_STORE(dest + 6 * stride, dc_value); - RECON_AND_STORE(dest + 7 * stride, dc_value); + idct8x8_12_add_kernel_sse2(io); + write_buffer_8x8(io, dest, stride); } -void idct8_sse2(__m128i *in) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); +static INLINE void recon_and_store_8_dual(uint8_t *const dest, + const __m128i in_x, + const int stride) { + const __m128i zero = _mm_setzero_si128(); + 
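  // Two 8-pixel rows are reconstructed per call: the bytes are widened to
  // 16 bits (unpack against zero), the shared DC residual is added to both
  // rows, and _mm_packus_epi16 narrows back with unsigned saturation, which
  // supplies the 0..255 pixel clamp for free. Per pixel this is the scalar
  //
  //   dest[x] = clip_pixel(dest[x] + a1);  /* clamp to [0, 255] */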
__m128i d0, d1; + + d0 = _mm_loadl_epi64((__m128i *)(dest + 0 * stride)); + d1 = _mm_loadl_epi64((__m128i *)(dest + 1 * stride)); + d0 = _mm_unpacklo_epi8(d0, zero); + d1 = _mm_unpacklo_epi8(d1, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_storel_epi64((__m128i *)(dest + 0 * stride), d0); + _mm_storeh_pi((__m64 *)(dest + 1 * stride), _mm_castsi128_ps(d0)); +} - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i dc_value; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); + + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 5); + dc_value = _mm_set1_epi16((int16_t)a1); + + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); + dest += 2 * stride; + recon_and_store_8_dual(dest, dc_value, stride); +} +void idct8_sse2(__m128i *const in) { // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], in0, - in1, in2, in3, in4, in5, in6, in7); + transpose_16bit_8x8(in, in); // 4-stage 1D idct8x8 - IDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in[0], in[1], in[2], in[3], - in[4], in[5], in[6], in[7]); + idct8(in, in); } -void iadst8_sse2(__m128i *in) { +void iadst8_sse2(__m128i *const in) { const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); @@ -375,7 +247,7 @@ void iadst8_sse2(__m128i *in) { const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__const_0 = _mm_set1_epi16(0); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); @@ -386,7 +258,7 @@ void iadst8_sse2(__m128i *in) { __m128i in0, in1, in2, in3, in4, in5, in6, in7; // transpose - array_transpose_8x8(in, in); + transpose_16bit_8x8(in, in); // properly aligned for butterfly input in0 = in[7]; @@ -548,37 +420,10 @@ void iadst8_sse2(__m128i *in) { u2 = _mm_unpacklo_epi16(s6, s7); u3 = _mm_unpackhi_epi16(s6, s7); - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, 
k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); + s2 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_p16); + s3 = idct_calc_wraplow_sse2(u0, u1, k__cospi_p16_m16); + s6 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_p16); + s7 = idct_calc_wraplow_sse2(u2, u3, k__cospi_p16_m16); in[0] = s0; in[1] = _mm_sub_epi16(k__const_0, s4); @@ -590,521 +435,133 @@ void iadst8_sse2(__m128i *in) { in[7] = _mm_sub_epi16(k__const_0, s1); } -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); +static INLINE void idct16_load8x8(const tran_low_t *const input, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * 16); + in[1] = load_input_data8(input + 1 * 16); + in[2] = load_input_data8(input + 2 * 16); + in[3] = load_input_data8(input + 3 * 16); + in[4] = load_input_data8(input + 4 * 16); + in[5] = load_input_data8(input + 5 * 16); + in[6] = load_input_data8(input + 6 * 16); + in[7] = load_input_data8(input + 7 * 16); +} - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - - // Rows. Load 4-row input data. 
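// For eob <= 12 every nonzero coefficient of the 8x8 block sits in its
// top-left 4x4 corner, which is why the removed code below loads only four
// rows and prunes each butterfly stage against known-zero inputs. The new
// vpx_idct8x8_12_add_sse2 above keeps the same idea but loads four 4-wide
// rows with load_input_data4 and calls the named helper
// idct8x8_12_add_kernel_sse2 instead of open-coding the pruned stages.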
- in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - // Stage1 - { - const __m128i lo_17 = _mm_unpackhi_epi16(in0, zero); - const __m128i lo_35 = _mm_unpackhi_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_4 = _mm_packs_epi32(tmp0, tmp2); - stp1_5 = _mm_packs_epi32(tmp4, tmp6); - } +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i l[16], r[16], out[16], *in; + int i; - // Stage2 - { - const __m128i lo_04 = _mm_unpacklo_epi16(in0, zero); - const __m128i lo_26 = _mm_unpacklo_epi16(in1, zero); - - tmp0 = _mm_madd_epi16(lo_04, stg2_0); - tmp2 = _mm_madd_epi16(lo_04, stg2_1); - tmp4 = _mm_madd_epi16(lo_26, stg2_2); - tmp6 = _mm_madd_epi16(lo_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp0, tmp2); - stp2_2 = _mm_packs_epi32(tmp6, tmp4); - - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + in = l; + for (i = 0; i < 2; i++) { + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); + idct16_load8x8(input + 8, in + 8); + transpose_16bit_8x8(in + 8, in + 8); + idct16_8col(in, in); + in = r; + input += 128; } - // Stage3 - { - const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6); + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); + idct16_8col(out, out); - tmp4 = _mm_add_epi16(stp2_0, stp2_2); - tmp6 = _mm_sub_epi16(stp2_0, stp2_2); + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } - stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); - stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); + dest += 8; + } +} - tmp0 = _mm_madd_epi16(lo_56, stg3_0); - tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0 +void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], temp[16], out[16]; + int i; - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + idct16_load8x8(input, in); + transpose_16bit_8x8(in, in); - stp1_5 = _mm_packs_epi32(tmp0, tmp2); + for (i = 8; i < 16; i++) { + in[i] = _mm_setzero_si128(); } + idct16_8col(in, temp); - // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - IDCT8(in0, in1, in2, 
in3, zero, zero, zero, zero, in0, in1, in2, in3, in4, - in5, in6, in7); - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} + for (i = 0; i < 16; i += 8) { + int j; + transpose_16bit_8x8(temp + i, in); + idct16_8col(in, out); -#define IDCT16 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], in[15]); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], in[15]); \ - const __m128i lo_9_7 = _mm_unpacklo_epi16(in[9], in[7]); \ - const __m128i hi_9_7 = _mm_unpackhi_epi16(in[9], in[7]); \ - const __m128i lo_5_11 = _mm_unpacklo_epi16(in[5], in[11]); \ - const __m128i hi_5_11 = _mm_unpackhi_epi16(in[5], in[11]); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(in[13], in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(in[13], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, stg2_0, stg2_1, \ - stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) \ - \ - MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, stg2_4, stg2_5, \ - stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], in[14]); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], in[14]); \ - const __m128i lo_10_6 = _mm_unpacklo_epi16(in[10], in[6]); \ - const __m128i hi_10_6 = _mm_unpackhi_epi16(in[10], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, stg3_0, stg3_1, \ - stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) \ - \ - stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - \ - stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], in[8]); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], in[8]); \ - const __m128i lo_4_12 = _mm_unpacklo_epi16(in[4], in[12]); \ - const __m128i hi_4_12 = _mm_unpackhi_epi16(in[4], in[12]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, 
stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - } + for (j = 0; j < 16; ++j) { + write_buffer_8x1(dest + j * stride, out[j]); + } -#define IDCT16_10 \ - /* Stage2 */ \ - { \ - const __m128i lo_1_15 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_15 = _mm_unpackhi_epi16(in[1], zero); \ - const __m128i lo_13_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_13_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_13_3, hi_13_3, stg2_0, stg2_1, \ - stg2_6, stg2_7, stp1_8_0, stp1_15, stp1_11, \ - stp1_12_0) \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_14 = _mm_unpackhi_epi16(in[2], zero); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_14, hi_2_14, stg3_0, stg3_1, stp2_4, stp2_7) \ - \ - stp1_9 = stp1_8_0; \ - stp1_10 = stp1_11; \ - \ - stp1_13 = stp1_12_0; \ - stp1_14 = stp1_15; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_8 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \ - 
const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_8, hi_0_8, stg4_0, stg4_1, stp1_0, stp1_1) \ - stp2_5 = stp2_4; \ - stp2_6 = stp2_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_7, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - \ - stp1_2 = stp1_1; \ - stp1_3 = stp1_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \ - \ - stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ + dest += 8; } +} -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - - const __m128i stg4_0 = 
pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[16], l[16], r[16], *curr1; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_8_0, stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i in[16], l[16]; int i; - curr1 = l; - for (i = 0; i < 2; i++) { - // 1-D idct - - // Load input data. - in[0] = load_input_data(input); - in[8] = load_input_data(input + 8 * 1); - in[1] = load_input_data(input + 8 * 2); - in[9] = load_input_data(input + 8 * 3); - in[2] = load_input_data(input + 8 * 4); - in[10] = load_input_data(input + 8 * 5); - in[3] = load_input_data(input + 8 * 6); - in[11] = load_input_data(input + 8 * 7); - in[4] = load_input_data(input + 8 * 8); - in[12] = load_input_data(input + 8 * 9); - in[5] = load_input_data(input + 8 * 10); - in[13] = load_input_data(input + 8 * 11); - in[6] = load_input_data(input + 8 * 12); - in[14] = load_input_data(input + 8 * 13); - in[7] = load_input_data(input + 8 * 14); - in[15] = load_input_data(input + 8 * 15); - - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - - IDCT16 - - // Stage7 - curr1[0] = _mm_add_epi16(stp2_0, stp1_15); - curr1[1] = _mm_add_epi16(stp2_1, stp1_14); - curr1[2] = _mm_add_epi16(stp2_2, stp2_13); - curr1[3] = _mm_add_epi16(stp2_3, stp2_12); - curr1[4] = _mm_add_epi16(stp2_4, stp2_11); - curr1[5] = _mm_add_epi16(stp2_5, stp2_10); - curr1[6] = _mm_add_epi16(stp2_6, stp1_9); - curr1[7] = _mm_add_epi16(stp2_7, stp1_8); - curr1[8] = _mm_sub_epi16(stp2_7, stp1_8); - curr1[9] = _mm_sub_epi16(stp2_6, stp1_9); - curr1[10] = _mm_sub_epi16(stp2_5, stp2_10); - curr1[11] = _mm_sub_epi16(stp2_4, stp2_11); - curr1[12] = _mm_sub_epi16(stp2_3, stp2_12); - curr1[13] = _mm_sub_epi16(stp2_2, stp2_13); - curr1[14] = _mm_sub_epi16(stp2_1, stp1_14); - curr1[15] = _mm_sub_epi16(stp2_0, stp1_15); - - curr1 = r; - input += 128; - } - for (i = 0; i < 2; i++) { + // First 1-D inverse DCT + // Load input data. 
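// With eob <= 10 every nonzero coefficient lies in the top-left 4x4 of the
// 16x16 block, so pass 1 only needs four 4-coefficient loads
// (load_input_data4) and the pruned idct16x16_10_pass1/idct16x16_10_pass2
// kernels in place of the full 16-column butterflies that the removed
// IDCT16 macro expanded to.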
+ in[0] = load_input_data4(input + 0 * 16); + in[1] = load_input_data4(input + 1 * 16); + in[2] = load_input_data4(input + 2 * 16); + in[3] = load_input_data4(input + 3 * 16); + + idct16x16_10_pass1(in, l); + + // Second 1-D inverse transform, performed per 8x16 block + for (i = 0; i < 16; i += 8) { int j; - // 1-D idct - array_transpose_8x8(l + i * 8, in); - array_transpose_8x8(r + i * 8, in + 8); - - IDCT16 - - // 2-D - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); + idct16x16_10_pass2(l + i, in); for (j = 0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, in[j]); } dest += 8; } } +static INLINE void recon_and_store_16(uint8_t *const dest, const __m128i in_x) { + const __m128i zero = _mm_setzero_si128(); + __m128i d0, d1; + + d0 = _mm_load_si128((__m128i *)(dest)); + d1 = _mm_unpackhi_epi8(d0, zero); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d1 = _mm_add_epi16(in_x, d1); + d0 = _mm_packus_epi16(d0, d1); + _mm_store_si128((__m128i *)(dest), d0); +} + void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, i; - - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); + int i; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (i = 0; i < 16; ++i) { - RECON_AND_STORE(dest + 0, dc_value); - RECON_AND_STORE(dest + 8, dc_value); + recon_and_store_16(dest, dc_value); dest += stride; } } -static void iadst16_8col(__m128i *in) { +static void iadst16_8col(__m128i *const in) { // perform 16x16 1-D ADST for 8 columns __m128i s[16], x[16], u[32], v[32]; const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); @@ -1132,8 +589,8 @@ static void iadst16_8col(__m128i *in) { const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); + const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); 
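// idct_calc_wraplow_sse2(), which this patch substitutes for the long
// madd/add/srai/packs sequences (as in the hunk below), wraps the common
// fixed-point step: two _mm_madd_epi16 results (each 16-bit lane pair
// (a, b) against a cosine pair (c0, c1)) are rounded, shifted and re-packed
// to 16 bits in one helper. Per 32-bit lane that is
//
//   out = (int16_t)((a * c0 + b * c1 + (1 << 13)) >> 14);  // DCT_CONST_BITS == 14
//
// and the saturating _mm_packs_epi32 agrees with the C WRAPLOW() result
// whenever the value fits in 16 bits, which holds for conformant streams.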
@@ -1505,1718 +962,371 @@ static void iadst16_8col(__m128i *in) { u[6] = _mm_unpacklo_epi16(s[14], s[15]); u[7] = _mm_unpackhi_epi16(s[14], s[15]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); + in[7] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_m16_m16); + in[8] = idct_calc_wraplow_sse2(u[0], u[1], k__cospi_p16_m16); + in[4] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_p16_p16); + in[11] = idct_calc_wraplow_sse2(u[2], u[3], k__cospi_m16_p16); + in[6] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_p16_p16); + in[9] = idct_calc_wraplow_sse2(u[4], u[5], k__cospi_m16_p16); + in[5] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_m16_m16); + in[10] = idct_calc_wraplow_sse2(u[6], u[7], k__cospi_p16_m16); in[0] = s[0]; in[1] = _mm_sub_epi16(kZero, s[8]); in[2] = s[12]; in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); in[12] = s[5]; in[13] = _mm_sub_epi16(kZero, s[13]); 
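// [Editor's note] Worked equations for the helpers used above, inferred from
// the removed madd/round/shift/pack sequences they replace:
//   idct_calc_wraplow_sse2(lo, hi, c) ==
//     _mm_packs_epi32(dct_const_round_shift(_mm_madd_epi16(lo, c)),
//                     dct_const_round_shift(_mm_madd_epi16(hi, c)))
// i.e. per interleaved pair (a, b): (a * c_even + b * c_odd + 8192) >> 14,
// saturated back to 16 bits; the in[7]/in[8]/... reordering folds the old
// v[]/u[] packing directly into the output slots. Likewise, judging from the
// pair_set_epi16(c0, -c1) / pair_set_epi16(c1, c0) constants it supersedes,
// butterfly(a, b, c0, c1, &o0, &o1) in the 32-point code below computes the
// rotation o0 = round(a * c0 - b * c1), o1 = round(a * c1 + b * c0).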
in[14] = s[9]; in[15] = _mm_sub_epi16(kZero, s[1]); } -static void idct16_8col(__m128i *in) { - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i v[16], u[16], s[16], t[16]; +void idct16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + idct16_8col(in0, in0); + idct16_8col(in1, in1); +} - // stage 1 - s[0] = in[0]; - s[1] = in[8]; - s[2] = in[4]; - s[3] = in[12]; - s[4] = in[2]; - s[5] = in[10]; - s[6] = in[6]; - s[7] = in[14]; - s[8] = in[1]; - s[9] = in[9]; - s[10] = in[5]; - s[11] = in[13]; - s[12] = in[3]; - s[13] = in[11]; - s[14] = in[7]; - s[15] = in[15]; +void iadst16_sse2(__m128i *const in0, __m128i *const in1) { + transpose_16bit_16x16(in0, in1); + iadst16_8col(in0); + iadst16_8col(in1); +} - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[15]); - u[1] = _mm_unpackhi_epi16(s[8], s[15]); - u[2] = _mm_unpacklo_epi16(s[9], s[14]); - u[3] = _mm_unpackhi_epi16(s[9], s[14]); - u[4] = _mm_unpacklo_epi16(s[10], s[13]); - u[5] = _mm_unpackhi_epi16(s[10], s[13]); - u[6] = _mm_unpacklo_epi16(s[11], s[12]); - u[7] = _mm_unpackhi_epi16(s[11], s[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_m02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_m02); - v[2] = _mm_madd_epi16(u[0], k__cospi_p02_p30); - v[3] = _mm_madd_epi16(u[1], k__cospi_p02_p30); - v[4] = _mm_madd_epi16(u[2], k__cospi_p14_m18); - v[5] = _mm_madd_epi16(u[3], k__cospi_p14_m18); - v[6] = _mm_madd_epi16(u[2], k__cospi_p18_p14); - v[7] = _mm_madd_epi16(u[3], k__cospi_p18_p14); - v[8] = _mm_madd_epi16(u[4], k__cospi_p22_m10); - v[9] = _mm_madd_epi16(u[5], k__cospi_p22_m10); - v[10] = _mm_madd_epi16(u[4], k__cospi_p10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_p10_p22); - v[12] = _mm_madd_epi16(u[6], k__cospi_p06_m26); - v[13] = _mm_madd_epi16(u[7], k__cospi_p06_m26); - v[14] = _mm_madd_epi16(u[6], k__cospi_p26_p06); - v[15] = _mm_madd_epi16(u[7], k__cospi_p26_p06); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], 
k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[8] = _mm_packs_epi32(u[0], u[1]); - s[15] = _mm_packs_epi32(u[2], u[3]); - s[9] = _mm_packs_epi32(u[4], u[5]); - s[14] = _mm_packs_epi32(u[6], u[7]); - s[10] = _mm_packs_epi32(u[8], u[9]); - s[13] = _mm_packs_epi32(u[10], u[11]); - s[11] = _mm_packs_epi32(u[12], u[13]); - s[12] = _mm_packs_epi32(u[14], u[15]); +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 + +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[8], step2[8]; // stage 3 - t[0] = s[0]; - t[1] = s[1]; - t[2] = s[2]; - t[3] = s[3]; - u[0] = _mm_unpacklo_epi16(s[4], s[7]); - u[1] = _mm_unpackhi_epi16(s[4], s[7]); - u[2] = _mm_unpacklo_epi16(s[5], s[6]); - u[3] = _mm_unpackhi_epi16(s[5], s[6]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[2] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[3] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[4] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[5] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[6] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[7] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - 
u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[4] = _mm_packs_epi32(u[0], u[1]); - t[7] = _mm_packs_epi32(u[2], u[3]); - t[5] = _mm_packs_epi32(u[4], u[5]); - t[6] = _mm_packs_epi32(u[6], u[7]); - t[8] = _mm_add_epi16(s[8], s[9]); - t[9] = _mm_sub_epi16(s[8], s[9]); - t[10] = _mm_sub_epi16(s[11], s[10]); - t[11] = _mm_add_epi16(s[10], s[11]); - t[12] = _mm_add_epi16(s[12], s[13]); - t[13] = _mm_sub_epi16(s[12], s[13]); - t[14] = _mm_sub_epi16(s[15], s[14]); - t[15] = _mm_add_epi16(s[14], s[15]); + butterfly(in[4], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); // stage 4 - u[0] = _mm_unpacklo_epi16(t[0], t[1]); - u[1] = _mm_unpackhi_epi16(t[0], t[1]); - u[2] = _mm_unpacklo_epi16(t[2], t[3]); - u[3] = _mm_unpackhi_epi16(t[2], t[3]); - u[4] = _mm_unpacklo_epi16(t[9], t[14]); - u[5] = _mm_unpackhi_epi16(t[9], t[14]); - u[6] = _mm_unpacklo_epi16(t[10], t[13]); - u[7] = _mm_unpackhi_epi16(t[10], t[13]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p24_m08); - v[5] = _mm_madd_epi16(u[3], k__cospi_p24_m08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_m08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_m08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_p08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_p08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_m08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_m08); - v[14] = _mm_madd_epi16(u[6], k__cospi_m08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_m08_p24); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = 
_mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_add_epi16(t[4], t[5]); - s[5] = _mm_sub_epi16(t[4], t[5]); - s[6] = _mm_sub_epi16(t[7], t[6]); - s[7] = _mm_add_epi16(t[6], t[7]); - s[8] = t[8]; - s[15] = t[15]; - s[9] = _mm_packs_epi32(u[8], u[9]); - s[14] = _mm_packs_epi32(u[10], u[11]); - s[10] = _mm_packs_epi32(u[12], u[13]); - s[13] = _mm_packs_epi32(u[14], u[15]); - s[11] = t[11]; - s[12] = t[12]; + step2[0] = butterfly_cospi16(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; // stage 5 - t[0] = _mm_add_epi16(s[0], s[3]); - t[1] = _mm_add_epi16(s[1], s[2]); - t[2] = _mm_sub_epi16(s[1], s[2]); - t[3] = _mm_sub_epi16(s[0], s[3]); - t[4] = s[4]; - t[7] = s[7]; - - u[0] = _mm_unpacklo_epi16(s[5], s[6]); - u[1] = _mm_unpackhi_epi16(s[5], s[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - t[5] = _mm_packs_epi32(u[0], u[1]); - t[6] = _mm_packs_epi32(u[2], u[3]); - - t[8] = _mm_add_epi16(s[8], s[11]); - t[9] = _mm_add_epi16(s[9], s[10]); - t[10] = _mm_sub_epi16(s[9], s[10]); - t[11] = _mm_sub_epi16(s[8], s[11]); - t[12] = _mm_sub_epi16(s[15], s[12]); - t[13] = _mm_sub_epi16(s[14], s[13]); - t[14] = _mm_add_epi16(s[13], s[14]); - t[15] = _mm_add_epi16(s[12], s[15]); + step1[0] = step2[0]; + step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; // stage 6 - s[0] = _mm_add_epi16(t[0], t[7]); - s[1] = _mm_add_epi16(t[1], t[6]); - s[2] = _mm_add_epi16(t[2], t[5]); - s[3] = _mm_add_epi16(t[3], t[4]); - s[4] = _mm_sub_epi16(t[3], t[4]); - s[5] = _mm_sub_epi16(t[2], t[5]); - s[6] = _mm_sub_epi16(t[1], t[6]); - s[7] = _mm_sub_epi16(t[0], t[7]); - s[8] = t[8]; - s[9] = t[9]; - - u[0] = _mm_unpacklo_epi16(t[10], t[13]); - u[1] = _mm_unpackhi_epi16(t[10], t[13]); - u[2] = _mm_unpacklo_epi16(t[11], t[12]); - u[3] = _mm_unpackhi_epi16(t[11], t[12]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - u[2] = 
_mm_srai_epi32(u[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - s[10] = _mm_packs_epi32(u[0], u[1]); - s[13] = _mm_packs_epi32(u[2], u[3]); - s[11] = _mm_packs_epi32(u[4], u[5]); - s[12] = _mm_packs_epi32(u[6], u[7]); - s[14] = t[14]; - s[15] = t[15]; - - // stage 7 - in[0] = _mm_add_epi16(s[0], s[15]); - in[1] = _mm_add_epi16(s[1], s[14]); - in[2] = _mm_add_epi16(s[2], s[13]); - in[3] = _mm_add_epi16(s[3], s[12]); - in[4] = _mm_add_epi16(s[4], s[11]); - in[5] = _mm_add_epi16(s[5], s[10]); - in[6] = _mm_add_epi16(s[6], s[9]); - in[7] = _mm_add_epi16(s[7], s[8]); - in[8] = _mm_sub_epi16(s[7], s[8]); - in[9] = _mm_sub_epi16(s[6], s[9]); - in[10] = _mm_sub_epi16(s[5], s[10]); - in[11] = _mm_sub_epi16(s[4], s[11]); - in[12] = _mm_sub_epi16(s[3], s[12]); - in[13] = _mm_sub_epi16(s[2], s[13]); - in[14] = _mm_sub_epi16(s[1], s[14]); - in[15] = _mm_sub_epi16(s[0], s[15]); + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -void idct16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - idct16_8col(in0); - idct16_8col(in1); +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[2], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -void iadst16_sse2(__m128i *in0, __m128i *in1) { - array_transpose_16x16(in0, in1); - iadst16_8col(in0); - iadst16_8col(in1); +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); } -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { const __m128i zero = _mm_setzero_si128(); + __m128i step1[32]; - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + // stage 1 + butterfly(in[1], zero, cospi_31_64, 
cospi_1_64, &step1[16], &step1[31]); + butterfly(zero, in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + butterfly(in[5], zero, cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(zero, in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64); +void idct32_34_8x32_sse2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i in[16], l[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_8, - stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, stp1_8_0, - stp1_12_0; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - // First 1-D inverse DCT - // Load input data. - in[0] = load_input_data(input); - in[1] = load_input_data(input + 8 * 2); - in[2] = load_input_data(input + 8 * 4); - in[3] = load_input_data(input + 8 * 6); - - TRANSPOSE_8X4(in[0], in[1], in[2], in[3], in[0], in[1]); - - // Stage2 - { - const __m128i lo_1_15 = _mm_unpackhi_epi16(in[0], zero); - const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, in[1]); - - tmp0 = _mm_madd_epi16(lo_1_15, stg2_0); - tmp2 = _mm_madd_epi16(lo_1_15, stg2_1); - tmp5 = _mm_madd_epi16(lo_13_3, stg2_6); - tmp7 = _mm_madd_epi16(lo_13_3, stg2_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp2_8 = _mm_packs_epi32(tmp0, tmp2); - stp2_11 = _mm_packs_epi32(tmp5, tmp7); - } + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); +} - // Stage3 - { - const __m128i lo_2_14 = _mm_unpacklo_epi16(in[1], zero); +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i io[32], col[32]; + int i; - tmp0 = _mm_madd_epi16(lo_2_14, stg3_0); - tmp2 = _mm_madd_epi16(lo_2_14, stg3_1); + // Load input data. Only need to load the top left 8x8 block. 
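  // [Editor's note] Context for the load below: the vp9 decoder dispatches to
  // this _34 variant only when the block's eob is at most 34, and in the
  // 32x32 scan order those leading coefficients all land in the top-left 8x8
  // corner; hence a single 8x8 load, with every other lane treated as zero by
  // the quarter functions above. (Editorial gloss, not part of this patch.)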
+ load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_sse2(io, col); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + for (i = 0; i < 32; i += 8) { + int j; + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_sse2(io, io); - stp1_13 = _mm_unpackhi_epi64(stp2_11, zero); - stp1_14 = _mm_unpackhi_epi64(stp2_8, zero); + for (j = 0; j < 32; ++j) { + write_buffer_8x1(dest + j * stride, io[j]); + } - stp1_4 = _mm_packs_epi32(tmp0, tmp2); + dest += 8; } +} - // Stage4 - { - const __m128i lo_0_8 = _mm_unpacklo_epi16(in[0], zero); - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp1_14); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp1_13); - - tmp0 = _mm_madd_epi16(lo_0_8, stg4_0); - tmp2 = _mm_madd_epi16(lo_0_8, stg4_1); - tmp1 = _mm_madd_epi16(lo_9_14, stg4_4); - tmp3 = _mm_madd_epi16(lo_9_14, stg4_5); - tmp5 = _mm_madd_epi16(lo_10_13, stg4_6); - tmp7 = _mm_madd_epi16(lo_10_13, stg4_7); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); - - stp1_0 = _mm_packs_epi32(tmp0, tmp0); - stp1_1 = _mm_packs_epi32(tmp2, tmp2); - stp2_9 = _mm_packs_epi32(tmp1, tmp3); - stp2_10 = _mm_packs_epi32(tmp5, tmp7); - - stp2_6 = _mm_unpackhi_epi64(stp1_4, zero); - } +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_1( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; - // Stage5 and Stage6 - { - tmp0 = _mm_add_epi16(stp2_8, stp2_11); - tmp1 = _mm_sub_epi16(stp2_8, stp2_11); - tmp2 = _mm_add_epi16(stp2_9, stp2_10); - tmp3 = _mm_sub_epi16(stp2_9, stp2_10); - - stp1_9 = _mm_unpacklo_epi64(tmp2, zero); - stp1_10 = _mm_unpacklo_epi64(tmp3, zero); - stp1_8 = _mm_unpacklo_epi64(tmp0, zero); - stp1_11 = _mm_unpacklo_epi64(tmp1, zero); - - stp1_13 = _mm_unpackhi_epi64(tmp3, zero); - stp1_14 = _mm_unpackhi_epi64(tmp2, zero); - stp1_12 = _mm_unpackhi_epi64(tmp1, zero); - stp1_15 = _mm_unpackhi_epi64(tmp0, zero); - } + // stage 3 + butterfly(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); - // Stage6 - { - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp1_4); - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); - - tmp1 = _mm_madd_epi16(lo_6_5, stg4_1); - tmp3 = _mm_madd_epi16(lo_6_5, stg4_0); - tmp0 = _mm_madd_epi16(lo_10_13, stg6_0); - tmp2 = _mm_madd_epi16(lo_10_13, stg4_0); - tmp4 = _mm_madd_epi16(lo_11_12, stg6_0); - tmp6 = _mm_madd_epi16(lo_11_12, stg4_0); - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp3 = 
_mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); - - stp1_6 = _mm_packs_epi32(tmp3, tmp1); - - stp2_10 = _mm_packs_epi32(tmp0, zero); - stp2_13 = _mm_packs_epi32(tmp2, zero); - stp2_11 = _mm_packs_epi32(tmp4, zero); - stp2_12 = _mm_packs_epi32(tmp6, zero); - - tmp0 = _mm_add_epi16(stp1_0, stp1_4); - tmp1 = _mm_sub_epi16(stp1_0, stp1_4); - tmp2 = _mm_add_epi16(stp1_1, stp1_6); - tmp3 = _mm_sub_epi16(stp1_1, stp1_6); - - stp2_0 = _mm_unpackhi_epi64(tmp0, zero); - stp2_1 = _mm_unpacklo_epi64(tmp2, zero); - stp2_2 = _mm_unpackhi_epi64(tmp2, zero); - stp2_3 = _mm_unpacklo_epi64(tmp0, zero); - stp2_4 = _mm_unpacklo_epi64(tmp1, zero); - stp2_5 = _mm_unpackhi_epi64(tmp3, zero); - stp2_6 = _mm_unpacklo_epi64(tmp3, zero); - stp2_7 = _mm_unpackhi_epi64(tmp1, zero); - } + // stage 4 + butterfly(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); - // Stage7. Left 8x16 only. - l[0] = _mm_add_epi16(stp2_0, stp1_15); - l[1] = _mm_add_epi16(stp2_1, stp1_14); - l[2] = _mm_add_epi16(stp2_2, stp2_13); - l[3] = _mm_add_epi16(stp2_3, stp2_12); - l[4] = _mm_add_epi16(stp2_4, stp2_11); - l[5] = _mm_add_epi16(stp2_5, stp2_10); - l[6] = _mm_add_epi16(stp2_6, stp1_9); - l[7] = _mm_add_epi16(stp2_7, stp1_8); - l[8] = _mm_sub_epi16(stp2_7, stp1_8); - l[9] = _mm_sub_epi16(stp2_6, stp1_9); - l[10] = _mm_sub_epi16(stp2_5, stp2_10); - l[11] = _mm_sub_epi16(stp2_4, stp2_11); - l[12] = _mm_sub_epi16(stp2_3, stp2_12); - l[13] = _mm_sub_epi16(stp2_2, stp2_13); - l[14] = _mm_sub_epi16(stp2_1, stp1_14); - l[15] = _mm_sub_epi16(stp2_0, stp1_15); + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; - // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { - int j; - array_transpose_4X8(l + 8 * i, in); - - IDCT16_10 - - // Stage7 - in[0] = _mm_add_epi16(stp2_0, stp1_15); - in[1] = _mm_add_epi16(stp2_1, stp1_14); - in[2] = _mm_add_epi16(stp2_2, stp2_13); - in[3] = _mm_add_epi16(stp2_3, stp2_12); - in[4] = _mm_add_epi16(stp2_4, stp2_11); - in[5] = _mm_add_epi16(stp2_5, stp2_10); - in[6] = _mm_add_epi16(stp2_6, stp1_9); - in[7] = _mm_add_epi16(stp2_7, stp1_8); - in[8] = _mm_sub_epi16(stp2_7, stp1_8); - in[9] = _mm_sub_epi16(stp2_6, stp1_9); - in[10] = _mm_sub_epi16(stp2_5, stp2_10); - in[11] = _mm_sub_epi16(stp2_4, stp2_11); - in[12] = _mm_sub_epi16(stp2_3, stp2_12); - in[13] = _mm_sub_epi16(stp2_2, stp2_13); - in[14] = _mm_sub_epi16(stp2_1, stp1_14); - in[15] = _mm_sub_epi16(stp2_0, stp1_15); + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); +} - for (j = 
0; j < 16; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; - dest += 8; - } + // stage 2 + butterfly(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -#define LOAD_DQCOEFF(reg, input) \ - { \ - reg = load_input_data(input); \ - input += 8; \ - } +static INLINE void idct32_1024_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_1024_8x32_quarter_1(in, temp); + idct32_1024_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); +} -#define IDCT32_34 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ - \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(zero, in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(zero, in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], zero); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], zero); \ - \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(zero, in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(zero, in[3]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_1_31, hi_1_31, stg1_0, stg1_1, stp1_16, \ - stp1_31); \ - MULTIPLICATION_AND_ADD_2(lo_25_7, hi_25_7, stg1_6, stg1_7, stp1_19, \ - stp1_28); \ - MULTIPLICATION_AND_ADD_2(lo_5_27, hi_5_27, stg1_8, stg1_9, stp1_20, \ - stp1_27); \ - MULTIPLICATION_AND_ADD_2(lo_29_3, hi_29_3, stg1_14, stg1_15, stp1_23, \ - stp1_24); \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ - \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(zero, in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(zero, in[6]); \ - \ - MULTIPLICATION_AND_ADD_2(lo_2_30, hi_2_30, stg2_0, stg2_1, stp2_8, \ - stp2_15); \ - MULTIPLICATION_AND_ADD_2(lo_26_6, hi_26_6, stg2_6, stg2_7, stp2_11, \ - stp2_12); \ - \ - stp2_16 = stp1_16; \ - stp2_19 = stp1_19; \ - \ - stp2_20 = stp1_20; \ - stp2_23 = stp1_23; \ - \ - stp2_24 = stp1_24; \ - stp2_27 = stp1_27; \ - \ - stp2_28 = stp1_28; \ - stp2_31 = stp1_31; \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1_16, stp1_31); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1_16, stp1_31); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1_19, 
stp1_28); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1_19, stp1_28); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1_20, stp1_27); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1_20, stp1_27); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1_23, stp1_24); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1_23, stp2_24); \ - \ - MULTIPLICATION_AND_ADD_2(lo_4_28, hi_4_28, stg3_0, stg3_1, stp1_4, \ - stp1_7); \ - \ - stp1_8 = stp2_8; \ - stp1_11 = stp2_11; \ - stp1_12 = stp2_12; \ - stp1_15 = stp2_15; \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); \ - \ - MULTIPLICATION_AND_ADD_2(lo_0_16, hi_0_16, stg4_0, stg4_1, stp2_0, \ - stp2_1); \ - \ - stp2_4 = stp1_4; \ - stp2_5 = stp1_4; \ - stp2_6 = stp1_7; \ - stp2_7 = stp1_7; \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = stp2_0; \ - stp1_1 = stp2_1; \ - stp1_2 = stp2_1; \ - stp1_3 = stp2_0; \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = 
_mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, 
stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } +// For each 8x32 block __m128i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_1024_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; -#define IDCT32 \ - /* Stage1 */ \ - { \ - const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); \ - const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); \ - const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); \ - const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); \ - \ - const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); \ - const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); \ - const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); \ - const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); \ - \ - const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); \ - const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); \ - const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); \ - const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); \ - \ - const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); \ - const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); \ - const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); \ - const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); \ - \ - MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, \ - stg1_1, stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, \ - stp1_30) \ - MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, \ - stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) \ - MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, \ - stg1_9, stg1_10, stg1_11, stp1_20, stp1_27, \ - stp1_21, stp1_26) \ - MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, \ - stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, \ - stp1_23, stp1_24) \ - } \ - \ - /* Stage2 */ \ - { \ - const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); \ - const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); \ - const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); \ - const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); \ 
- \ - const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); \ - const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); \ - const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); \ - const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); \ - \ - MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, \ - stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, \ - stp2_14) \ - MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, \ - stg2_5, stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_17); \ - stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); \ - stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_18); \ - \ - stp2_20 = _mm_add_epi16(stp1_20, stp1_21); \ - stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_22); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_25); \ - stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); \ - stp2_27 = _mm_add_epi16(stp1_27, stp1_26); \ - \ - stp2_28 = _mm_add_epi16(stp1_28, stp1_29); \ - stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); \ - stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_31, stp1_30); \ - } \ - \ - /* Stage3 */ \ - { \ - const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); \ - const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); \ - const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); \ - const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); \ - \ - const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); \ - const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - \ - MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, \ - stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, \ - stp1_6) \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_9); \ - stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \ - stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \ - stp1_12 = _mm_add_epi16(stp2_12, stp2_13); \ - stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \ - \ - MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, \ - stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, \ - stp1_29) \ - MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, \ - stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, \ - stp1_25) \ - \ - stp1_16 = stp2_16; \ - stp1_31 = stp2_31; \ - stp1_19 = stp2_19; \ - stp1_20 = stp2_20; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_27 = stp2_27; \ - stp1_28 = stp2_28; \ - } \ - \ - /* Stage4 */ \ - { \ - const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); \ - const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); \ - const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); \ - const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); \ - \ - const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \ - const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, 
stp1_14); \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - \ - MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, \ - stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) \ - \ - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ - \ - MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, \ - stg4_5, stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, \ - stp2_13) \ - \ - stp2_8 = stp1_8; \ - stp2_15 = stp1_15; \ - stp2_11 = stp1_11; \ - stp2_12 = stp1_12; \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_19); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_18); \ - stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); \ - stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); \ - stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); \ - stp2_22 = _mm_add_epi16(stp1_22, stp1_21); \ - stp2_23 = _mm_add_epi16(stp1_23, stp1_20); \ - \ - stp2_24 = _mm_add_epi16(stp1_24, stp1_27); \ - stp2_25 = _mm_add_epi16(stp1_25, stp1_26); \ - stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); \ - stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); \ - stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_29, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_28, stp1_31); \ - } \ - \ - /* Stage5 */ \ - { \ - const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ - const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ - const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); \ - const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); \ - \ - const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); \ - const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ - \ - tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \ - tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \ - tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \ - tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - \ - stp1_5 = _mm_packs_epi32(tmp0, tmp1); \ - stp1_6 = _mm_packs_epi32(tmp2, tmp3); \ - \ - stp1_4 = stp2_4; \ - stp1_7 = stp2_7; \ - \ - stp1_8 = _mm_add_epi16(stp2_8, stp2_11); \ - stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \ - stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \ - stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \ - stp1_15 = _mm_add_epi16(stp2_15, stp2_12); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - \ - MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, \ - stg4_5, stg4_4, stg4_5, 
stp1_18, stp1_29, stp1_19, \ - stp1_28) \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, \ - stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - \ - stp1_22 = stp2_22; \ - stp1_23 = stp2_23; \ - stp1_24 = stp2_24; \ - stp1_25 = stp2_25; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } \ - \ - /* Stage6 */ \ - { \ - const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \ - const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \ - const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \ - const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \ - \ - stp2_0 = _mm_add_epi16(stp1_0, stp1_7); \ - stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \ - stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \ - stp2_3 = _mm_add_epi16(stp1_3, stp1_4); \ - stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); \ - stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \ - stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \ - stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); \ - \ - stp2_8 = stp1_8; \ - stp2_9 = stp1_9; \ - stp2_14 = stp1_14; \ - stp2_15 = stp1_15; \ - \ - MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, \ - stp2_12) \ - \ - stp2_16 = _mm_add_epi16(stp1_16, stp1_23); \ - stp2_17 = _mm_add_epi16(stp1_17, stp1_22); \ - stp2_18 = _mm_add_epi16(stp1_18, stp1_21); \ - stp2_19 = _mm_add_epi16(stp1_19, stp1_20); \ - stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); \ - stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); \ - stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); \ - stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); \ - \ - stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); \ - stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); \ - stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); \ - stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); \ - stp2_28 = _mm_add_epi16(stp1_27, stp1_28); \ - stp2_29 = _mm_add_epi16(stp1_26, stp1_29); \ - stp2_30 = _mm_add_epi16(stp1_25, stp1_30); \ - stp2_31 = _mm_add_epi16(stp1_24, stp1_31); \ - } \ - \ - /* Stage7 */ \ - { \ - const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); \ - const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); \ - const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); \ - const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); \ - \ - const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); \ - const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); \ - const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); \ - const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); \ - \ - stp1_0 = _mm_add_epi16(stp2_0, stp2_15); \ - stp1_1 = _mm_add_epi16(stp2_1, stp2_14); \ - stp1_2 = _mm_add_epi16(stp2_2, stp2_13); \ - stp1_3 = _mm_add_epi16(stp2_3, stp2_12); \ - stp1_4 = _mm_add_epi16(stp2_4, stp2_11); \ - stp1_5 = _mm_add_epi16(stp2_5, stp2_10); \ - stp1_6 = _mm_add_epi16(stp2_6, stp2_9); \ - stp1_7 = _mm_add_epi16(stp2_7, stp2_8); \ - stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); \ - stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); \ - stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); \ - stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); \ - stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); \ - stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); \ - stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); \ - stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); \ - \ - stp1_16 = stp2_16; \ - stp1_17 = stp2_17; \ - stp1_18 = stp2_18; \ - stp1_19 = stp2_19; \ - \ - MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, \ - stp1_26) \ - 
MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, \ - stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, \ - stp1_24) \ - \ - stp1_28 = stp2_28; \ - stp1_29 = stp2_29; \ - stp1_30 = stp2_30; \ - stp1_31 = stp2_31; \ - } + // stage 1 + butterfly(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); -// Only upper-left 8x8 has non-zero coeff -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[32]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; + butterfly(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); - // Load input data. Only need to load the top left 8x8 block. 
- in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); - - array_transpose_8x8(in, in); - IDCT32_34 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[0] = _mm_add_epi16(stp1_0, stp1_31); - col[1] = _mm_add_epi16(stp1_1, stp1_30); - col[2] = _mm_add_epi16(stp1_2, stp1_29); - col[3] = _mm_add_epi16(stp1_3, stp1_28); - col[4] = _mm_add_epi16(stp1_4, stp1_27); - col[5] = _mm_add_epi16(stp1_5, stp1_26); - col[6] = _mm_add_epi16(stp1_6, stp1_25); - col[7] = _mm_add_epi16(stp1_7, stp1_24); - col[8] = _mm_add_epi16(stp1_8, stp1_23); - col[9] = _mm_add_epi16(stp1_9, stp1_22); - col[10] = _mm_add_epi16(stp1_10, stp1_21); - col[11] = _mm_add_epi16(stp1_11, stp1_20); - col[12] = _mm_add_epi16(stp1_12, stp1_19); - col[13] = _mm_add_epi16(stp1_13, stp1_18); - col[14] = _mm_add_epi16(stp1_14, stp1_17); - col[15] = _mm_add_epi16(stp1_15, stp1_16); - col[16] = _mm_sub_epi16(stp1_15, stp1_16); - col[17] = _mm_sub_epi16(stp1_14, stp1_17); - col[18] = _mm_sub_epi16(stp1_13, stp1_18); - col[19] = _mm_sub_epi16(stp1_12, stp1_19); - col[20] = _mm_sub_epi16(stp1_11, stp1_20); - col[21] = _mm_sub_epi16(stp1_10, stp1_21); - col[22] = _mm_sub_epi16(stp1_9, stp1_22); - col[23] = _mm_sub_epi16(stp1_8, stp1_23); - col[24] = _mm_sub_epi16(stp1_7, stp1_24); - col[25] = _mm_sub_epi16(stp1_6, stp1_25); - col[26] = _mm_sub_epi16(stp1_5, stp1_26); - col[27] = _mm_sub_epi16(stp1_4, stp1_27); - col[28] = _mm_sub_epi16(stp1_3, stp1_28); - col[29] = _mm_sub_epi16(stp1_2, stp1_29); - col[30] = _mm_sub_epi16(stp1_1, stp1_30); - col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { - int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - IDCT32_34 - - // 2_D: Calculate the results and store them to destination. 
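// (The two IDCT32_34 invocations above realize the separable 2-D
// inverse transform: the first pass runs the 1-D IDCT32 across the
// rows and keeps all 32 intermediate vectors in col[], then each loop
// iteration transposes an 8x32 tile back and runs the same 1-D IDCT32
// down the columns; the adds/subtracts below form the last butterfly
// stage before the final rounding, shift by 6 and reconstruction.)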
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); + butterfly(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); - dest += 8; - } + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} + +void idct32_1024_8x32(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; + + idct32_1024_8x32_quarter_1_2(in, temp); + idct32_1024_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); } void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i 
rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - - // idct constants for each stage - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in[32], col[128], zero_idx[16]; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, - stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, - stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, - stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, - stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, - stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, 
stp2_21, stp2_22, stp2_23, - stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; - __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i, j, i32; + __m128i col[4][32], io[32]; + int i; + // rows for (i = 0; i < 4; i++) { - i32 = (i << 5); - // First 1-D idct - // Load input data. - LOAD_DQCOEFF(in[0], input); - LOAD_DQCOEFF(in[8], input); - LOAD_DQCOEFF(in[16], input); - LOAD_DQCOEFF(in[24], input); - LOAD_DQCOEFF(in[1], input); - LOAD_DQCOEFF(in[9], input); - LOAD_DQCOEFF(in[17], input); - LOAD_DQCOEFF(in[25], input); - LOAD_DQCOEFF(in[2], input); - LOAD_DQCOEFF(in[10], input); - LOAD_DQCOEFF(in[18], input); - LOAD_DQCOEFF(in[26], input); - LOAD_DQCOEFF(in[3], input); - LOAD_DQCOEFF(in[11], input); - LOAD_DQCOEFF(in[19], input); - LOAD_DQCOEFF(in[27], input); - - LOAD_DQCOEFF(in[4], input); - LOAD_DQCOEFF(in[12], input); - LOAD_DQCOEFF(in[20], input); - LOAD_DQCOEFF(in[28], input); - LOAD_DQCOEFF(in[5], input); - LOAD_DQCOEFF(in[13], input); - LOAD_DQCOEFF(in[21], input); - LOAD_DQCOEFF(in[29], input); - LOAD_DQCOEFF(in[6], input); - LOAD_DQCOEFF(in[14], input); - LOAD_DQCOEFF(in[22], input); - LOAD_DQCOEFF(in[30], input); - LOAD_DQCOEFF(in[7], input); - LOAD_DQCOEFF(in[15], input); - LOAD_DQCOEFF(in[23], input); - LOAD_DQCOEFF(in[31], input); - - // checking if all entries are zero - zero_idx[0] = _mm_or_si128(in[0], in[1]); - zero_idx[1] = _mm_or_si128(in[2], in[3]); - zero_idx[2] = _mm_or_si128(in[4], in[5]); - zero_idx[3] = _mm_or_si128(in[6], in[7]); - zero_idx[4] = _mm_or_si128(in[8], in[9]); - zero_idx[5] = _mm_or_si128(in[10], in[11]); - zero_idx[6] = _mm_or_si128(in[12], in[13]); - zero_idx[7] = _mm_or_si128(in[14], in[15]); - zero_idx[8] = _mm_or_si128(in[16], in[17]); - zero_idx[9] = _mm_or_si128(in[18], in[19]); - zero_idx[10] = _mm_or_si128(in[20], in[21]); - zero_idx[11] = _mm_or_si128(in[22], in[23]); - zero_idx[12] = _mm_or_si128(in[24], in[25]); - zero_idx[13] = _mm_or_si128(in[26], in[27]); - zero_idx[14] = _mm_or_si128(in[28], in[29]); - zero_idx[15] = _mm_or_si128(in[30], in[31]); - - zero_idx[0] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[1] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[2] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[3] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[4] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[5] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[6] = _mm_or_si128(zero_idx[12], zero_idx[13]); - zero_idx[7] = _mm_or_si128(zero_idx[14], zero_idx[15]); - - zero_idx[8] = _mm_or_si128(zero_idx[0], zero_idx[1]); - zero_idx[9] = _mm_or_si128(zero_idx[2], zero_idx[3]); - zero_idx[10] = _mm_or_si128(zero_idx[4], zero_idx[5]); - zero_idx[11] = _mm_or_si128(zero_idx[6], zero_idx[7]); - zero_idx[12] = _mm_or_si128(zero_idx[8], zero_idx[9]); - zero_idx[13] = _mm_or_si128(zero_idx[10], zero_idx[11]); - zero_idx[14] = _mm_or_si128(zero_idx[12], zero_idx[13]); - - if (_mm_movemask_epi8(_mm_cmpeq_epi32(zero_idx[14], zero)) == 0xFFFF) { - col[i32 + 0] = _mm_setzero_si128(); - col[i32 + 1] = _mm_setzero_si128(); - col[i32 + 2] = _mm_setzero_si128(); - col[i32 + 3] = _mm_setzero_si128(); - col[i32 + 4] = _mm_setzero_si128(); - col[i32 + 5] = _mm_setzero_si128(); - col[i32 + 6] = _mm_setzero_si128(); - col[i32 + 7] = _mm_setzero_si128(); - col[i32 + 8] = _mm_setzero_si128(); - col[i32 + 9] = _mm_setzero_si128(); - col[i32 + 10] = _mm_setzero_si128(); - col[i32 + 11] = _mm_setzero_si128(); - col[i32 + 12] = _mm_setzero_si128(); - col[i32 + 13] = _mm_setzero_si128(); - col[i32 + 14] 
= _mm_setzero_si128(); - col[i32 + 15] = _mm_setzero_si128(); - col[i32 + 16] = _mm_setzero_si128(); - col[i32 + 17] = _mm_setzero_si128(); - col[i32 + 18] = _mm_setzero_si128(); - col[i32 + 19] = _mm_setzero_si128(); - col[i32 + 20] = _mm_setzero_si128(); - col[i32 + 21] = _mm_setzero_si128(); - col[i32 + 22] = _mm_setzero_si128(); - col[i32 + 23] = _mm_setzero_si128(); - col[i32 + 24] = _mm_setzero_si128(); - col[i32 + 25] = _mm_setzero_si128(); - col[i32 + 26] = _mm_setzero_si128(); - col[i32 + 27] = _mm_setzero_si128(); - col[i32 + 28] = _mm_setzero_si128(); - col[i32 + 29] = _mm_setzero_si128(); - col[i32 + 30] = _mm_setzero_si128(); - col[i32 + 31] = _mm_setzero_si128(); - continue; - } + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + load_transpose_16bit_8x8(&input[16], 32, &io[16]); + load_transpose_16bit_8x8(&input[24], 32, &io[24]); + idct32_1024_8x32(io, col[i]); + input += 32 << 3; + } + // columns + for (i = 0; i < 32; i += 8) { // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - IDCT32 - - // 1_D: Store 32 intermediate results for each 8x32 block. - col[i32 + 0] = _mm_add_epi16(stp1_0, stp1_31); - col[i32 + 1] = _mm_add_epi16(stp1_1, stp1_30); - col[i32 + 2] = _mm_add_epi16(stp1_2, stp1_29); - col[i32 + 3] = _mm_add_epi16(stp1_3, stp1_28); - col[i32 + 4] = _mm_add_epi16(stp1_4, stp1_27); - col[i32 + 5] = _mm_add_epi16(stp1_5, stp1_26); - col[i32 + 6] = _mm_add_epi16(stp1_6, stp1_25); - col[i32 + 7] = _mm_add_epi16(stp1_7, stp1_24); - col[i32 + 8] = _mm_add_epi16(stp1_8, stp1_23); - col[i32 + 9] = _mm_add_epi16(stp1_9, stp1_22); - col[i32 + 10] = _mm_add_epi16(stp1_10, stp1_21); - col[i32 + 11] = _mm_add_epi16(stp1_11, stp1_20); - col[i32 + 12] = _mm_add_epi16(stp1_12, stp1_19); - col[i32 + 13] = _mm_add_epi16(stp1_13, stp1_18); - col[i32 + 14] = _mm_add_epi16(stp1_14, stp1_17); - col[i32 + 15] = _mm_add_epi16(stp1_15, stp1_16); - col[i32 + 16] = _mm_sub_epi16(stp1_15, stp1_16); - col[i32 + 17] = _mm_sub_epi16(stp1_14, stp1_17); - col[i32 + 18] = _mm_sub_epi16(stp1_13, stp1_18); - col[i32 + 19] = _mm_sub_epi16(stp1_12, stp1_19); - col[i32 + 20] = _mm_sub_epi16(stp1_11, stp1_20); - col[i32 + 21] = _mm_sub_epi16(stp1_10, stp1_21); - col[i32 + 22] = _mm_sub_epi16(stp1_9, stp1_22); - col[i32 + 23] = _mm_sub_epi16(stp1_8, stp1_23); - col[i32 + 24] = _mm_sub_epi16(stp1_7, stp1_24); - col[i32 + 25] = _mm_sub_epi16(stp1_6, stp1_25); - col[i32 + 26] = _mm_sub_epi16(stp1_5, stp1_26); - col[i32 + 27] = _mm_sub_epi16(stp1_4, stp1_27); - col[i32 + 28] = _mm_sub_epi16(stp1_3, stp1_28); - col[i32 + 29] = _mm_sub_epi16(stp1_2, stp1_29); - col[i32 + 30] = _mm_sub_epi16(stp1_1, stp1_30); - col[i32 + 31] = _mm_sub_epi16(stp1_0, stp1_31); + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + transpose_16bit_8x8(col[2] + i, io + 16); + transpose_16bit_8x8(col[3] + i, io + 24); + + idct32_1024_8x32(io, io); + store_buffer_8x32(io, dest, stride); + dest += 8; } - for (i = 0; i < 4; i++) { - // Second 1-D idct - j = i << 3; +} - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - IDCT32 - - // 2_D: Calculate the results and store them to destination. 
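// (The old first pass above skipped all-zero 8x32 tiles by OR-ing the
// 32 input vectors together and testing the result with a compare plus
// movemask. A minimal standalone sketch of that test -- the helper
// name is illustrative only, requires <emmintrin.h>:)
static int idct_rows_are_all_zero(const __m128i *v, int n) {
  const __m128i zero = _mm_setzero_si128();
  __m128i acc = zero;
  int k;
  for (k = 0; k < n; ++k) acc = _mm_or_si128(acc, v[k]);
  // acc == 0 iff every 32-bit compare lane is all ones, i.e. the
  // 16-bit byte mask is 0xFFFF.
  return _mm_movemask_epi8(_mm_cmpeq_epi32(acc, zero)) == 0xFFFF;
}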
- in[0] = _mm_add_epi16(stp1_0, stp1_31); - in[1] = _mm_add_epi16(stp1_1, stp1_30); - in[2] = _mm_add_epi16(stp1_2, stp1_29); - in[3] = _mm_add_epi16(stp1_3, stp1_28); - in[4] = _mm_add_epi16(stp1_4, stp1_27); - in[5] = _mm_add_epi16(stp1_5, stp1_26); - in[6] = _mm_add_epi16(stp1_6, stp1_25); - in[7] = _mm_add_epi16(stp1_7, stp1_24); - in[8] = _mm_add_epi16(stp1_8, stp1_23); - in[9] = _mm_add_epi16(stp1_9, stp1_22); - in[10] = _mm_add_epi16(stp1_10, stp1_21); - in[11] = _mm_add_epi16(stp1_11, stp1_20); - in[12] = _mm_add_epi16(stp1_12, stp1_19); - in[13] = _mm_add_epi16(stp1_13, stp1_18); - in[14] = _mm_add_epi16(stp1_14, stp1_17); - in[15] = _mm_add_epi16(stp1_15, stp1_16); - in[16] = _mm_sub_epi16(stp1_15, stp1_16); - in[17] = _mm_sub_epi16(stp1_14, stp1_17); - in[18] = _mm_sub_epi16(stp1_13, stp1_18); - in[19] = _mm_sub_epi16(stp1_12, stp1_19); - in[20] = _mm_sub_epi16(stp1_11, stp1_20); - in[21] = _mm_sub_epi16(stp1_10, stp1_21); - in[22] = _mm_sub_epi16(stp1_9, stp1_22); - in[23] = _mm_sub_epi16(stp1_8, stp1_23); - in[24] = _mm_sub_epi16(stp1_7, stp1_24); - in[25] = _mm_sub_epi16(stp1_6, stp1_25); - in[26] = _mm_sub_epi16(stp1_5, stp1_26); - in[27] = _mm_sub_epi16(stp1_4, stp1_27); - in[28] = _mm_sub_epi16(stp1_3, stp1_28); - in[29] = _mm_sub_epi16(stp1_2, stp1_29); - in[30] = _mm_sub_epi16(stp1_1, stp1_30); - in[31] = _mm_sub_epi16(stp1_0, stp1_31); +void vpx_idct32x32_135_add_sse2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], in[32], out[32]; + int i; - for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); - } + for (i = 16; i < 32; i++) { + in[i] = _mm_setzero_si128(); + } + // rows + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &in[0]); + load_transpose_16bit_8x8(&input[8], 32, &in[8]); + idct32_1024_8x32(in, col[i]); + input += 32 << 3; + } + + // columns + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, in); + transpose_16bit_8x8(col[1] + i, in + 8); + idct32_1024_8x32(in, out); + store_buffer_8x32(out, dest, stride); dest += 8; } } @@ -3224,19 +1334,17 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i dc_value; - const __m128i zero = _mm_setzero_si128(); - int a, j; + int j; + tran_high_t a1; + tran_low_t out = + WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64)); - a = (int)dct_const_round_shift(input[0] * cospi_16_64); - a = (int)dct_const_round_shift(a * cospi_16_64); - a = ROUND_POWER_OF_TWO(a, 6); - - dc_value = _mm_set1_epi16(a); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); + a1 = ROUND_POWER_OF_TWO(out, 6); + dc_value = _mm_set1_epi16((int16_t)a1); for (j = 0; j < 32; ++j) { - RECON_AND_STORE(dest + 0 + j * stride, dc_value); - RECON_AND_STORE(dest + 8 + j * stride, dc_value); - RECON_AND_STORE(dest + 16 + j * stride, dc_value); - RECON_AND_STORE(dest + 24 + j * stride, dc_value); + recon_and_store_16(dest + j * stride + 0, dc_value); + recon_and_store_16(dest + j * stride + 16, dc_value); } } diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h index 0460ab13b..5cd5098f1 100644 --- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.h +++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.h @@ -12,272 +12,173 @@ #define VPX_DSP_X86_INV_TXFM_SSE2_H_ #include <emmintrin.h> // SSE2 + #include "./vpx_config.h" #include 
"vpx/vpx_integer.h" #include "vpx_dsp/inv_txfm.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -// perform 8x8 transpose -static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); +static INLINE void idct8x8_12_transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 30 31 32 33 00 01 02 03 + // in[1]: 20 21 22 23 10 11 12 13 + // in[2]: 40 41 42 43 70 71 72 73 + // in[3]: 50 51 52 53 60 61 62 63 + // to: + // tr0_0: 00 10 01 11 02 12 03 13 + // tr0_1: 20 30 21 31 22 32 23 33 + // tr0_2: 40 50 41 51 42 52 43 53 + // tr0_3: 60 70 61 71 62 72 63 73 + const __m128i tr0_0 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i tr0_1 = _mm_unpacklo_epi16(in[1], in[0]); + const __m128i tr0_2 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i tr0_3 = _mm_unpackhi_epi16(in[3], in[2]); + // Unpack 32 bit elements resulting in: + // tr1_0: 00 10 20 30 01 11 21 31 + // tr1_1: 02 12 22 32 03 13 23 33 + // tr1_2: 40 50 60 70 41 51 61 71 + // tr1_3: 42 52 62 72 43 53 63 73 const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); + out[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); + out[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); + out[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); } -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \ - const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \ - const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \ - const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \ - const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \ 
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
-                                                            \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4);                \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4);                \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6);                \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6);                \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5);                \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5);                \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7);                \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7);                \
-  }
-#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1)   \
-  {                                                     \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-                                                        \
-    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
-    in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
-  }
+static INLINE __m128i dct_const_round_shift_sse2(const __m128i in) {
+  const __m128i t = _mm_add_epi32(in, _mm_set1_epi32(DCT_CONST_ROUNDING));
+  return _mm_srai_epi32(t, DCT_CONST_BITS);
+}
-static INLINE void array_transpose_4X8(__m128i *in, __m128i *out) {
-  const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
-  const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
-  const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+static INLINE __m128i idct_madd_round_shift_sse2(const __m128i in,
+                                                 const __m128i cospi) {
+  const __m128i t = _mm_madd_epi16(in, cospi);
+  return dct_const_round_shift_sse2(t);
+}
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
-  const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+// Calculate the dot product between in0/1 and x and wrap to short.
+static INLINE __m128i idct_calc_wraplow_sse2(const __m128i in0,
+                                             const __m128i in1,
+                                             const __m128i x) {
+  const __m128i t0 = idct_madd_round_shift_sse2(in0, x);
+  const __m128i t1 = idct_madd_round_shift_sse2(in1, x);
+  return _mm_packs_epi32(t0, t1);
+}
-  out[0] = _mm_unpacklo_epi64(tr1_0, tr1_4);
-  out[1] = _mm_unpackhi_epi64(tr1_0, tr1_4);
-  out[2] = _mm_unpacklo_epi64(tr1_2, tr1_6);
-  out[3] = _mm_unpackhi_epi64(tr1_2, tr1_6);
+// Multiply elements by constants and add them together.
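// (butterfly(in0, in1, c0, c1, out0, out1) below computes, per signed
// 16-bit lane, the fixed-point rotation
//   *out0 = ROUND_POWER_OF_TWO(in0 * c0 - in1 * c1, DCT_CONST_BITS)
//   *out1 = ROUND_POWER_OF_TWO(in0 * c1 + in1 * c0, DCT_CONST_BITS)
// with DCT_CONST_BITS == 14: pair_set_epi16 interleaves (c0, -c1) and
// (c1, c0), each _mm_madd_epi16 in idct_calc_wraplow_sse2() forms the
// two dot products, and the rounded 32-bit results are saturated back
// to 16 bits by _mm_packs_epi32.)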
+static INLINE void butterfly(const __m128i in0, const __m128i in1, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = pair_set_epi16(c0, -c1); + const __m128i cst1 = pair_set_epi16(c1, c0); + const __m128i lo = _mm_unpacklo_epi16(in0, in1); + const __m128i hi = _mm_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_sse2(lo, hi, cst0); + *out1 = idct_calc_wraplow_sse2(lo, hi, cst1); } -static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8(res0, res0); - array_transpose_8x8(res1, tbuf); - array_transpose_8x8(res0 + 8, res1); - array_transpose_8x8(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; +static INLINE __m128i butterfly_cospi16(const __m128i in) { + const __m128i cst = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i lo = _mm_unpacklo_epi16(in, _mm_setzero_si128()); + const __m128i hi = _mm_unpackhi_epi16(in, _mm_setzero_si128()); + return idct_calc_wraplow_sse2(lo, hi, cst); } -// Function to allow 8 bit optimisations to be used when profile 0 is used with +// Functions to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled -static INLINE __m128i load_input_data(const tran_low_t *data) { +static INLINE __m128i load_input_data4(const tran_low_t *data) { #if CONFIG_VP9_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); + const __m128i zero = _mm_setzero_si128(); + const __m128i in = _mm_load_si128((const __m128i *)data); + return _mm_packs_epi32(in, zero); #else - return _mm_load_si128((const __m128i *)data); + return _mm_loadl_epi64((const __m128i *)data); #endif } -static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) { - in[0] = load_input_data(input + 0 * 16); - in[1] = load_input_data(input + 1 * 16); - in[2] = load_input_data(input + 2 * 16); - in[3] = load_input_data(input + 3 * 16); - in[4] = load_input_data(input + 4 * 16); - in[5] = load_input_data(input + 5 * 16); - in[6] = load_input_data(input + 6 * 16); - in[7] = load_input_data(input + 7 * 16); - - in[8] = load_input_data(input + 8 * 16); - in[9] = load_input_data(input + 9 * 16); - in[10] = load_input_data(input + 10 * 16); - in[11] = load_input_data(input + 11 * 16); - in[12] = load_input_data(input + 12 * 16); - in[13] = load_input_data(input + 13 * 16); - in[14] = load_input_data(input + 14 * 16); - in[15] = load_input_data(input + 15 * 16); +static INLINE __m128i load_input_data8(const tran_low_t *data) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_load_si128((const __m128i *)data); + const __m128i in1 = _mm_load_si128((const __m128i *)(data + 4)); + return _mm_packs_epi32(in0, in1); +#else + return _mm_load_si128((const __m128i *)data); +#endif } -#define RECON_AND_STORE(dest, in_x) \ - { \ - __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \ - d0 = _mm_unpacklo_epi8(d0, zero); \ - d0 = _mm_add_epi16(in_x, d0); \ - d0 = _mm_packus_epi16(d0, d0); \ - _mm_storel_epi64((__m128i *)(dest), d0); \ - } +static INLINE void load_transpose_16bit_8x8(const tran_low_t *input, + const int stride, + __m128i *const in) { + in[0] = load_input_data8(input + 0 * stride); + in[1] = load_input_data8(input + 1 * stride); + in[2] = load_input_data8(input + 2 * stride); + in[3] = load_input_data8(input + 3 * stride); + in[4] = 
load_input_data8(input + 4 * stride); + in[5] = load_input_data8(input + 5 * stride); + in[6] = load_input_data8(input + 6 * stride); + in[7] = load_input_data8(input + 7 * stride); + transpose_16bit_8x8(in, in); +} -static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); +static INLINE void recon_and_store(uint8_t *const dest, const __m128i in_x) { const __m128i zero = _mm_setzero_si128(); - // Final rounding and shift - in[0] = _mm_adds_epi16(in[0], final_rounding); - in[1] = _mm_adds_epi16(in[1], final_rounding); - in[2] = _mm_adds_epi16(in[2], final_rounding); - in[3] = _mm_adds_epi16(in[3], final_rounding); - in[4] = _mm_adds_epi16(in[4], final_rounding); - in[5] = _mm_adds_epi16(in[5], final_rounding); - in[6] = _mm_adds_epi16(in[6], final_rounding); - in[7] = _mm_adds_epi16(in[7], final_rounding); - in[8] = _mm_adds_epi16(in[8], final_rounding); - in[9] = _mm_adds_epi16(in[9], final_rounding); - in[10] = _mm_adds_epi16(in[10], final_rounding); - in[11] = _mm_adds_epi16(in[11], final_rounding); - in[12] = _mm_adds_epi16(in[12], final_rounding); - in[13] = _mm_adds_epi16(in[13], final_rounding); - in[14] = _mm_adds_epi16(in[14], final_rounding); - in[15] = _mm_adds_epi16(in[15], final_rounding); - - in[0] = _mm_srai_epi16(in[0], 6); - in[1] = _mm_srai_epi16(in[1], 6); - in[2] = _mm_srai_epi16(in[2], 6); - in[3] = _mm_srai_epi16(in[3], 6); - in[4] = _mm_srai_epi16(in[4], 6); - in[5] = _mm_srai_epi16(in[5], 6); - in[6] = _mm_srai_epi16(in[6], 6); - in[7] = _mm_srai_epi16(in[7], 6); - in[8] = _mm_srai_epi16(in[8], 6); - in[9] = _mm_srai_epi16(in[9], 6); - in[10] = _mm_srai_epi16(in[10], 6); - in[11] = _mm_srai_epi16(in[11], 6); - in[12] = _mm_srai_epi16(in[12], 6); - in[13] = _mm_srai_epi16(in[13], 6); - in[14] = _mm_srai_epi16(in[14], 6); - in[15] = _mm_srai_epi16(in[15], 6); - - RECON_AND_STORE(dest + 0 * stride, in[0]); - RECON_AND_STORE(dest + 1 * stride, in[1]); - RECON_AND_STORE(dest + 2 * stride, in[2]); - RECON_AND_STORE(dest + 3 * stride, in[3]); - RECON_AND_STORE(dest + 4 * stride, in[4]); - RECON_AND_STORE(dest + 5 * stride, in[5]); - RECON_AND_STORE(dest + 6 * stride, in[6]); - RECON_AND_STORE(dest + 7 * stride, in[7]); - RECON_AND_STORE(dest + 8 * stride, in[8]); - RECON_AND_STORE(dest + 9 * stride, in[9]); - RECON_AND_STORE(dest + 10 * stride, in[10]); - RECON_AND_STORE(dest + 11 * stride, in[11]); - RECON_AND_STORE(dest + 12 * stride, in[12]); - RECON_AND_STORE(dest + 13 * stride, in[13]); - RECON_AND_STORE(dest + 14 * stride, in[14]); - RECON_AND_STORE(dest + 15 * stride, in[15]); + __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); + d0 = _mm_unpacklo_epi8(d0, zero); + d0 = _mm_add_epi16(in_x, d0); + d0 = _mm_packus_epi16(d0, d0); + _mm_storel_epi64((__m128i *)(dest), d0); } -#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \ - { \ - const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \ - const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \ - const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \ - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \ - \ - out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \ - out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \ - out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \ - out3 = _mm_unpackhi_epi64(tr1_2, 
tr1_6); \ - } +static INLINE void round_shift_8x8(const __m128i *const in, + __m128i *const out) { + const __m128i final_rounding = _mm_set1_epi16(1 << 4); -#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \ - out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \ - out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \ - } + out[0] = _mm_add_epi16(in[0], final_rounding); + out[1] = _mm_add_epi16(in[1], final_rounding); + out[2] = _mm_add_epi16(in[2], final_rounding); + out[3] = _mm_add_epi16(in[3], final_rounding); + out[4] = _mm_add_epi16(in[4], final_rounding); + out[5] = _mm_add_epi16(in[5], final_rounding); + out[6] = _mm_add_epi16(in[6], final_rounding); + out[7] = _mm_add_epi16(in[7], final_rounding); -// Define Macro for multiplying elements by constants and adding them together. -#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \ - res0, res1, res2, res3) \ - { \ - tmp0 = _mm_madd_epi16(lo_0, cst0); \ - tmp1 = _mm_madd_epi16(hi_0, cst0); \ - tmp2 = _mm_madd_epi16(lo_0, cst1); \ - tmp3 = _mm_madd_epi16(hi_0, cst1); \ - tmp4 = _mm_madd_epi16(lo_1, cst2); \ - tmp5 = _mm_madd_epi16(hi_1, cst2); \ - tmp6 = _mm_madd_epi16(lo_1, cst3); \ - tmp7 = _mm_madd_epi16(hi_1, cst3); \ - \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp4 = _mm_add_epi32(tmp4, rounding); \ - tmp5 = _mm_add_epi32(tmp5, rounding); \ - tmp6 = _mm_add_epi32(tmp6, rounding); \ - tmp7 = _mm_add_epi32(tmp7, rounding); \ - \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \ - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \ - tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \ - tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \ - \ - res0 = _mm_packs_epi32(tmp0, tmp1); \ - res1 = _mm_packs_epi32(tmp2, tmp3); \ - res2 = _mm_packs_epi32(tmp4, tmp5); \ - res3 = _mm_packs_epi32(tmp6, tmp7); \ - } + out[0] = _mm_srai_epi16(out[0], 5); + out[1] = _mm_srai_epi16(out[1], 5); + out[2] = _mm_srai_epi16(out[2], 5); + out[3] = _mm_srai_epi16(out[3], 5); + out[4] = _mm_srai_epi16(out[4], 5); + out[5] = _mm_srai_epi16(out[5], 5); + out[6] = _mm_srai_epi16(out[6], 5); + out[7] = _mm_srai_epi16(out[7], 5); +} + +static INLINE void write_buffer_8x8(const __m128i *const in, + uint8_t *const dest, const int stride) { + __m128i t[8]; + + round_shift_8x8(in, t); + + recon_and_store(dest + 0 * stride, t[0]); + recon_and_store(dest + 1 * stride, t[1]); + recon_and_store(dest + 2 * stride, t[2]); + recon_and_store(dest + 3 * stride, t[3]); + recon_and_store(dest + 4 * stride, t[4]); + recon_and_store(dest + 5 * stride, t[5]); + recon_and_store(dest + 6 * stride, t[6]); + recon_and_store(dest + 7 * stride, t[7]); +} static INLINE void recon_and_store4x4_sse2(const __m128i *const in, uint8_t *const dest, @@ -307,11 +208,502 @@ static INLINE void recon_and_store4x4_sse2(const __m128i *const in, *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d[0]); } -void idct4_sse2(__m128i *in); -void idct8_sse2(__m128i *in); -void idct16_sse2(__m128i *in0, __m128i *in1); -void iadst4_sse2(__m128i *in); -void iadst8_sse2(__m128i *in); -void iadst16_sse2(__m128i *in0, __m128i *in1); +static INLINE void store_buffer_8x32(__m128i 
*in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + recon_and_store(dst, in[j]); + dst += stride; + recon_and_store(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void write_buffer_8x1(uint8_t *const dest, const __m128i in) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i out; + out = _mm_adds_epi16(in, final_rounding); + out = _mm_srai_epi16(out, 6); + recon_and_store(dest, out); +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, + int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +static INLINE void idct8(const __m128i *const in /*in[8]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 1 + butterfly(in[1], in[7], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[5], in[3], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 2 + butterfly(in[0], in[4], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[2], in[6], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + out[0] = _mm_add_epi16(step1[0], step2[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step2[4]); + out[4] = _mm_sub_epi16(step1[3], step2[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_add_kernel_sse2(__m128i *const io /*io[8]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i cp_16_16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + __m128i step1[8], step2[8], tmp[4]; + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + { + const __m128i cp_28_n4 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i cp_4_28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i cp_n20_12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i cp_12_20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i lo_1 = _mm_unpackhi_epi16(io[0], zero); + const __m128i lo_3 = _mm_unpackhi_epi16(io[1], zero); + step1[4] = idct_calc_wraplow_sse2(cp_28_n4, cp_4_28, lo_1); // step1 4&7 + step1[5] = idct_calc_wraplow_sse2(cp_n20_12, cp_12_20, lo_3); // step1 5&6 + } + + // stage 2 + { + const __m128i cp_24_n8 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i cp_8_24 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i lo_0 = _mm_unpacklo_epi16(io[0], zero); 
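// (Throughout this 12-coefficient path a single __m128i carries two
// logical values in its low and high 64-bit halves -- step1[4] above
// is "step1 4&7", step2[0] below is "step2 0&1" -- so the 4x8 first
// pass does half the multiplies of the full kernel until
// idct8x8_12_transpose_16bit_4x8() rebuilds complete vectors.)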
+ const __m128i lo_2 = _mm_unpacklo_epi16(io[1], zero); + const __m128i t = idct_madd_round_shift_sse2(cp_16_16, lo_0); + step2[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[2] = idct_calc_wraplow_sse2(cp_8_24, cp_24_n8, lo_2); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], zero); // step2 6 + } + + // stage 3 + { + const __m128i lo_65 = _mm_unpacklo_epi16(step2[6], step2[5]); + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, lo_65); // step1 5&6 + } + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + io[4] = io[5] = io[6] = io[7] = zero; + + idct8(io, io); +} + +static INLINE void idct16_8col(const __m128i *const in /*in[16]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + butterfly(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[10], step2[11]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step1[4] = _mm_add_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step1[7] = _mm_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[1], step2[2]); + step1[2] = _mm_sub_epi16(step2[1], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // 
stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[1], step1[6]); + step2[2] = _mm_add_epi16(step1[2], step1[5]); + step2[3] = _mm_add_epi16(step1[3], step1[4]); + step2[4] = _mm_sub_epi16(step1[3], step1[4]); + step2[5] = _mm_sub_epi16(step1[2], step1[5]); + step2[6] = _mm_sub_epi16(step1[1], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm_add_epi16(step2[0], step1[15]); + out[1] = _mm_add_epi16(step2[1], step1[14]); + out[2] = _mm_add_epi16(step2[2], step2[13]); + out[3] = _mm_add_epi16(step2[3], step2[12]); + out[4] = _mm_add_epi16(step2[4], step2[11]); + out[5] = _mm_add_epi16(step2[5], step2[10]); + out[6] = _mm_add_epi16(step2[6], step1[9]); + out[7] = _mm_add_epi16(step2[7], step1[8]); + out[8] = _mm_sub_epi16(step2[7], step1[8]); + out[9] = _mm_sub_epi16(step2[6], step1[9]); + out[10] = _mm_sub_epi16(step2[5], step2[10]); + out[11] = _mm_sub_epi16(step2[4], step2[11]); + out[12] = _mm_sub_epi16(step2[3], step2[12]); + out[13] = _mm_sub_epi16(step2[2], step2[13]); + out[14] = _mm_sub_epi16(step2[1], step1[14]); + out[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass1(const __m128i *const input /*input[4]*/, + __m128i *const output /*output[16]*/) { + const __m128i zero = _mm_setzero_si128(); + const __m128i k__cospi_p16_p16 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i step1[16], step2[16]; + + transpose_16bit_4x4(input, output); + + // stage 2 + { + const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); + const __m128i lo_1_15 = _mm_unpackhi_epi16(output[0], zero); + const __m128i lo_13_3 = _mm_unpackhi_epi16(zero, output[1]); + step2[8] = idct_calc_wraplow_sse2(k__cospi_p30_m02, k__cospi_p02_p30, + lo_1_15); // step2 8&15 + step2[11] = idct_calc_wraplow_sse2(k__cospi_p06_m26, k__cospi_p26_p06, + lo_13_3); // step2 11&12 + } + + // stage 3 + { + const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i lo_2_14 = _mm_unpacklo_epi16(output[1], zero); + step1[4] = idct_calc_wraplow_sse2(k__cospi_p28_m04, k__cospi_p04_p28, + lo_2_14); // step1 4&7 + step1[13] = _mm_unpackhi_epi64(step2[11], zero); + step1[14] = _mm_unpackhi_epi64(step2[8], zero); + } + + // stage 4 + { + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i lo_0_8 = _mm_unpacklo_epi16(output[0], zero); + const __m128i lo_9_14 = _mm_unpacklo_epi16(step2[8], step1[14]); + const __m128i lo_10_13 = _mm_unpacklo_epi16(step2[11], step1[13]); + const __m128i t = idct_madd_round_shift_sse2(lo_0_8, k__cospi_p16_p16); + step1[0] = _mm_packs_epi32(t, t); // step2 0&1 + step2[9] = idct_calc_wraplow_sse2(k__cospi_m08_p24, k__cospi_p24_p08, + lo_9_14); // step2 9&14 + step2[10] = idct_calc_wraplow_sse2(k__cospi_m24_m08, 
k__cospi_m08_p24, + lo_10_13); // step2 10&13 + step2[6] = _mm_unpackhi_epi64(step1[4], zero); + } + + // stage 5 + { + const __m128i lo_5_6 = _mm_unpacklo_epi16(step1[4], step2[6]); + step1[6] = idct_calc_wraplow_sse2(k__cospi_p16_p16, k__cospi_m16_p16, + lo_5_6); // step1 6&5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_unpackhi_epi64(step1[11], zero); + step1[13] = _mm_unpackhi_epi64(step1[10], zero); + step1[14] = _mm_unpackhi_epi64(step1[9], zero); + step1[15] = _mm_unpackhi_epi64(step1[8], zero); + } + + // stage 6 + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(step1[10], step1[13]); + const __m128i lo_11_12 = _mm_unpacklo_epi16(step1[11], step1[12]); + step2[10] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_10_13); // step2 10&13 + step2[11] = idct_calc_wraplow_sse2(k__cospi_m16_p16, k__cospi_p16_p16, + lo_11_12); // step2 11&12 + step2[13] = _mm_unpackhi_epi64(step2[10], zero); + step2[12] = _mm_unpackhi_epi64(step2[11], zero); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[0] = _mm_unpackhi_epi64(step2[3], zero); + step2[2] = _mm_unpackhi_epi64(step2[1], zero); + step2[5] = _mm_unpackhi_epi64(step2[6], zero); + step2[7] = _mm_unpackhi_epi64(step2[4], zero); + } + + // stage 7. Left 8x16 only. + output[0] = _mm_add_epi16(step2[0], step1[15]); + output[1] = _mm_add_epi16(step2[1], step1[14]); + output[2] = _mm_add_epi16(step2[2], step2[13]); + output[3] = _mm_add_epi16(step2[3], step2[12]); + output[4] = _mm_add_epi16(step2[4], step2[11]); + output[5] = _mm_add_epi16(step2[5], step2[10]); + output[6] = _mm_add_epi16(step2[6], step1[9]); + output[7] = _mm_add_epi16(step2[7], step1[8]); + output[8] = _mm_sub_epi16(step2[7], step1[8]); + output[9] = _mm_sub_epi16(step2[6], step1[9]); + output[10] = _mm_sub_epi16(step2[5], step2[10]); + output[11] = _mm_sub_epi16(step2[4], step2[11]); + output[12] = _mm_sub_epi16(step2[3], step2[12]); + output[13] = _mm_sub_epi16(step2[2], step2[13]); + output[14] = _mm_sub_epi16(step2[1], step1[14]); + output[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct16x16_10_pass2(__m128i *const l /*l[8]*/, + __m128i *const io /*io[16]*/) { + const __m128i zero = _mm_setzero_si128(); + __m128i step1[16], step2[16]; + + transpose_16bit_4x8(l, io); + + // stage 2 + butterfly(io[1], zero, cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly(zero, io[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly(io[2], zero, cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step1[0] = butterfly_cospi16(io[0]); + butterfly(step2[15], step2[8], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step2[11], step2[12], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + + // stage 5 + butterfly(step1[7], step1[4], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], 
step2[12]); + + // stage 6 + step2[0] = _mm_add_epi16(step1[0], step1[7]); + step2[1] = _mm_add_epi16(step1[0], step1[6]); + step2[2] = _mm_add_epi16(step1[0], step1[5]); + step2[3] = _mm_add_epi16(step1[0], step1[4]); + step2[4] = _mm_sub_epi16(step1[0], step1[4]); + step2[5] = _mm_sub_epi16(step1[0], step1[5]); + step2[6] = _mm_sub_epi16(step1[0], step1[6]); + step2[7] = _mm_sub_epi16(step1[0], step1[7]); + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + io[0] = _mm_add_epi16(step2[0], step1[15]); + io[1] = _mm_add_epi16(step2[1], step1[14]); + io[2] = _mm_add_epi16(step2[2], step2[13]); + io[3] = _mm_add_epi16(step2[3], step2[12]); + io[4] = _mm_add_epi16(step2[4], step2[11]); + io[5] = _mm_add_epi16(step2[5], step2[10]); + io[6] = _mm_add_epi16(step2[6], step1[9]); + io[7] = _mm_add_epi16(step2[7], step1[8]); + io[8] = _mm_sub_epi16(step2[7], step1[8]); + io[9] = _mm_sub_epi16(step2[6], step1[9]); + io[10] = _mm_sub_epi16(step2[5], step2[10]); + io[11] = _mm_sub_epi16(step2[4], step2[11]); + io[12] = _mm_sub_epi16(step2[3], step2[12]); + io[13] = _mm_sub_epi16(step2[2], step2[13]); + io[14] = _mm_sub_epi16(step2[1], step1[14]); + io[15] = _mm_sub_epi16(step2[0], step1[15]); +} + +static INLINE void idct32_8x32_quarter_2_stage_4_to_6( + __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) { + __m128i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm_add_epi16(step2[8], step2[11]); + step1[9] = _mm_add_epi16(step2[9], step2[10]); + step1[10] = _mm_sub_epi16(step2[9], step2[10]); + step1[11] = _mm_sub_epi16(step2[8], step2[11]); + step1[12] = _mm_sub_epi16(step2[15], step2[12]); + step1[13] = _mm_sub_epi16(step2[14], step2[13]); + step1[14] = _mm_add_epi16(step2[14], step2[13]); + step1[15] = _mm_add_epi16(step2[15], step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], &out[13]); + butterfly(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +static INLINE void idct32_8x32_quarter_3_4_stage_4_to_7( + __m128i *const step1 /*step1[32]*/, __m128i *const out /*out[32]*/) { + __m128i step2[32]; + + // stage 4 + step2[16] = _mm_add_epi16(step1[16], step1[19]); + step2[17] = _mm_add_epi16(step1[17], step1[18]); + step2[18] = _mm_sub_epi16(step1[17], step1[18]); + step2[19] = _mm_sub_epi16(step1[16], step1[19]); + step2[20] = _mm_sub_epi16(step1[23], step1[20]); + step2[21] = _mm_sub_epi16(step1[22], step1[21]); + step2[22] = _mm_add_epi16(step1[22], step1[21]); + step2[23] = _mm_add_epi16(step1[23], step1[20]); + + step2[24] = _mm_add_epi16(step1[24], step1[27]); + step2[25] = _mm_add_epi16(step1[25], step1[26]); + step2[26] = _mm_sub_epi16(step1[25], step1[26]); + step2[27] = _mm_sub_epi16(step1[24], step1[27]); + step2[28] = _mm_sub_epi16(step1[31], step1[28]); + step2[29] = _mm_sub_epi16(step1[30], step1[29]); + step2[30] = _mm_add_epi16(step1[29], step1[30]); + step2[31] = _mm_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly(step2[29], step2[18], 
cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm_add_epi16(step1[16], step1[23]); + out[17] = _mm_add_epi16(step1[17], step1[22]); + out[18] = _mm_add_epi16(step1[18], step1[21]); + out[19] = _mm_add_epi16(step1[19], step1[20]); + step2[20] = _mm_sub_epi16(step1[19], step1[20]); + step2[21] = _mm_sub_epi16(step1[18], step1[21]); + step2[22] = _mm_sub_epi16(step1[17], step1[22]); + step2[23] = _mm_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm_sub_epi16(step1[31], step1[24]); + step2[25] = _mm_sub_epi16(step1[30], step1[25]); + step2[26] = _mm_sub_epi16(step1[29], step1[26]); + step2[27] = _mm_sub_epi16(step1[28], step1[27]); + out[28] = _mm_add_epi16(step1[27], step1[28]); + out[29] = _mm_add_epi16(step1[26], step1[29]); + out[30] = _mm_add_epi16(step1[25], step1[30]); + out[31] = _mm_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], &out[27]); + butterfly(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], &out[26]); + butterfly(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], &out[25]); + butterfly(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], &out[24]); +} + +void idct4_sse2(__m128i *const in); +void idct8_sse2(__m128i *const in); +void idct16_sse2(__m128i *const in0, __m128i *const in1); +void iadst4_sse2(__m128i *const in); +void iadst8_sse2(__m128i *const in); +void iadst16_sse2(__m128i *const in0, __m128i *const in1); +void idct32_1024_8x32(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_sse2(const __m128i *const in, __m128i *const out); +void idct32_34_8x32_ssse3(const __m128i *const in, __m128i *const out); #endif // VPX_DSP_X86_INV_TXFM_SSE2_H_ diff --git a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c index 4d2d95787..6e99469b6 100644 --- a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c +++ b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c @@ -12,1322 +12,353 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/inv_txfm_ssse3.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - 
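// (The idct32_1024_8x32 / idct32_34_8x32_* prototypes added above let
// the SSE2 and SSSE3 32x32 wrappers share one 1-D transform per
// 8-sample-wide column tile; the 135-coefficient wrapper reuses
// idct32_1024_8x32 simply by zeroing input rows 16..31 first, as seen
// earlier in this diff.)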
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; - int i; - - // Load input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - in4 = load_input_data(input + 8 * 4); - in5 = load_input_data(input + 8 * 5); - in6 = load_input_data(input + 8 * 6); - in7 = load_input_data(input + 8 * 7); - - // 2-D - for (i = 0; i < 2; i++) { - // 8x8 Transpose is copied from vpx_fdct8x8_sse2() - TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, - in4, in5, in6, in7); - - // 4-stage 1D idct8x8 - { - /* Stage1 */ - { - const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); - const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); - const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); - const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); - - { - tmp0 = _mm_madd_epi16(lo_17, stg1_0); - tmp1 = _mm_madd_epi16(hi_17, stg1_0); - tmp2 = _mm_madd_epi16(lo_17, stg1_1); - tmp3 = _mm_madd_epi16(hi_17, stg1_1); - tmp4 = _mm_madd_epi16(lo_35, stg1_2); - tmp5 = _mm_madd_epi16(hi_35, stg1_2); - tmp6 = _mm_madd_epi16(lo_35, stg1_3); - tmp7 = _mm_madd_epi16(hi_35, stg1_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - tmp6 = _mm_add_epi32(tmp6, rounding); - tmp7 = _mm_add_epi32(tmp7, rounding); - - tmp0 = _mm_srai_epi32(tmp0, 14); - tmp1 = _mm_srai_epi32(tmp1, 14); - tmp2 = _mm_srai_epi32(tmp2, 14); - tmp3 = _mm_srai_epi32(tmp3, 14); - tmp4 = _mm_srai_epi32(tmp4, 14); - tmp5 = _mm_srai_epi32(tmp5, 14); - tmp6 = _mm_srai_epi32(tmp6, 14); - tmp7 = _mm_srai_epi32(tmp7, 14); - - stp1_4 = _mm_packs_epi32(tmp0, tmp1); - stp1_7 = _mm_packs_epi32(tmp2, tmp3); - stp1_5 = _mm_packs_epi32(tmp4, tmp5); - stp1_6 = _mm_packs_epi32(tmp6, tmp7); - } - } - - /* Stage2 */ - { - const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); - const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); - - { - tmp0 = _mm_unpacklo_epi16(in0, in4); - tmp1 = _mm_unpackhi_epi16(in0, in4); - - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, stk2_0); - tmp4 = _mm_madd_epi16(tmp0, stk2_1); - tmp5 = _mm_madd_epi16(tmp1, stk2_1); - - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - - stp2_0 = _mm_packs_epi32(tmp2, tmp3); - stp2_1 = _mm_packs_epi32(tmp4, tmp5); - - tmp0 = _mm_madd_epi16(lo_26, stg2_2); - tmp1 = _mm_madd_epi16(hi_26, stg2_2); - tmp2 = _mm_madd_epi16(lo_26, stg2_3); - tmp3 = _mm_madd_epi16(hi_26, stg2_3); - - tmp0 = _mm_add_epi32(tmp0, rounding); - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - - tmp0 = _mm_srai_epi32(tmp0, 14); - tmp1 = _mm_srai_epi32(tmp1, 14); - tmp2 = _mm_srai_epi32(tmp2, 14); - tmp3 = _mm_srai_epi32(tmp3, 14); - - stp2_2 = _mm_packs_epi32(tmp0, tmp1); - stp2_3 = _mm_packs_epi32(tmp2, tmp3); - } - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - } - - /* Stage3 */ - { - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - 
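// The Stage3/Stage4 tail of this removed kernel is the generic even/odd
// recombination out[i] = a[i] + a[n-1-i], out[n-1-i] = a[i] - a[n-1-i],
// here at size 8; the rewrite reuses the shared add_sub_butterfly() helper
// (removed from this file further down, size 16/32) for the larger
// transforms. A scalar sketch of that recombination (add_sub_model is a
// hypothetical name; like _mm_add_epi16, the int16_t casts wrap on overflow):
static INLINE void add_sub_model(const int16_t *in, int16_t *out, int size) {
  int i;
  for (i = 0; i < size / 2; ++i) {
    out[i] = (int16_t)(in[i] + in[size - 1 - i]);
    out[size - 1 - i] = (int16_t)(in[i] - in[size - 1 - i]);
  }
}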
stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); - - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp4 = _mm_madd_epi16(tmp0, stk2_0); - tmp5 = _mm_madd_epi16(tmp1, stk2_0); - - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp4 = _mm_add_epi32(tmp4, rounding); - tmp5 = _mm_add_epi32(tmp5, rounding); - - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); - tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp2, tmp3); - stp1_6 = _mm_packs_epi32(tmp4, tmp5); - } - - /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); - } - } - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); +static INLINE void partial_butterfly_ssse3(const __m128i in, const int c0, + const int c1, __m128i *const out0, + __m128i *const out1) { + const __m128i cst0 = _mm_set1_epi16(2 * c0); + const __m128i cst1 = _mm_set1_epi16(2 * c1); + *out0 = _mm_mulhrs_epi16(in, cst0); + *out1 = _mm_mulhrs_epi16(in, cst1); +} - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); +static INLINE __m128i partial_butterfly_cospi16_ssse3(const __m128i in) { + const __m128i coef_pair = _mm_set1_epi16(2 * cospi_16_64); + return _mm_mulhrs_epi16(in, coef_pair); } void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i final_rounding = _mm_set1_epi16(1 << 4); - const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - const 
__m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; - __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; - __m128i tmp0, tmp1, tmp2, tmp3; - - // Rows. Load 4-row input data. - in0 = load_input_data(input); - in1 = load_input_data(input + 8 * 1); - in2 = load_input_data(input + 8 * 2); - in3 = load_input_data(input + 8 * 3); - - // 8x4 Transpose - TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); - - // Stage1 - tmp0 = _mm_mulhrs_epi16(in0, stg1_0); - tmp1 = _mm_mulhrs_epi16(in0, stg1_1); - tmp2 = _mm_mulhrs_epi16(in1, stg1_2); - tmp3 = _mm_mulhrs_epi16(in1, stg1_3); - - stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); - stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); - - // Stage2 - tmp0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); - - tmp1 = _mm_mulhrs_epi16(in1, stg2_2); - tmp2 = _mm_mulhrs_epi16(in1, stg2_3); - stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); - - tmp0 = _mm_add_epi16(stp1_4, stp1_5); - tmp1 = _mm_sub_epi16(stp1_4, stp1_5); - - stp2_4 = tmp0; - stp2_5 = _mm_unpacklo_epi64(tmp1, zero); - stp2_6 = _mm_unpackhi_epi64(tmp1, zero); - - tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); - tmp1 = _mm_madd_epi16(tmp0, stg3_0); - tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 - - tmp1 = _mm_add_epi32(tmp1, rounding); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - - stp1_5 = _mm_packs_epi32(tmp1, tmp2); - - // Stage3 - tmp2 = _mm_add_epi16(stp2_0, stp2_2); - tmp3 = _mm_sub_epi16(stp2_0, stp2_2); + __m128i io[8]; - stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); - stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); - - // Stage4 - tmp0 = _mm_add_epi16(stp1_3, stp2_4); - tmp1 = _mm_add_epi16(stp1_2, stp1_5); - tmp2 = _mm_sub_epi16(stp1_3, stp2_4); - tmp3 = _mm_sub_epi16(stp1_2, stp1_5); - - TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) - - /* Stage1 */ - stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); - stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); - stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); - stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); - - /* Stage2 */ - stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); - stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); - - stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); - stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); - - stp2_4 = _mm_add_epi16(stp1_4, stp1_5); - stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); - stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); - stp2_7 = _mm_add_epi16(stp1_7, stp1_6); - - /* Stage3 */ - stp1_0 = _mm_add_epi16(stp2_0, stp2_3); - stp1_1 = _mm_add_epi16(stp2_1, stp2_2); - stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); - stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); - - tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); - tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); - - tmp2 = _mm_madd_epi16(tmp0, stk2_0); - tmp3 = _mm_madd_epi16(tmp1, stk2_0); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_6 = _mm_packs_epi32(tmp2, tmp3); - - tmp2 = _mm_madd_epi16(tmp0, stk2_1); - tmp3 = _mm_madd_epi16(tmp1, stk2_1); - tmp2 = _mm_add_epi32(tmp2, rounding); - tmp3 = _mm_add_epi32(tmp3, rounding); - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); - stp1_5 = _mm_packs_epi32(tmp2, tmp3); - - /* Stage4 */ - in0 = _mm_add_epi16(stp1_0, stp2_7); - in1 = _mm_add_epi16(stp1_1, stp1_6); - in2 = _mm_add_epi16(stp1_2, 
stp1_5); - in3 = _mm_add_epi16(stp1_3, stp2_4); - in4 = _mm_sub_epi16(stp1_3, stp2_4); - in5 = _mm_sub_epi16(stp1_2, stp1_5); - in6 = _mm_sub_epi16(stp1_1, stp1_6); - in7 = _mm_sub_epi16(stp1_0, stp2_7); - - // Final rounding and shift - in0 = _mm_adds_epi16(in0, final_rounding); - in1 = _mm_adds_epi16(in1, final_rounding); - in2 = _mm_adds_epi16(in2, final_rounding); - in3 = _mm_adds_epi16(in3, final_rounding); - in4 = _mm_adds_epi16(in4, final_rounding); - in5 = _mm_adds_epi16(in5, final_rounding); - in6 = _mm_adds_epi16(in6, final_rounding); - in7 = _mm_adds_epi16(in7, final_rounding); - - in0 = _mm_srai_epi16(in0, 5); - in1 = _mm_srai_epi16(in1, 5); - in2 = _mm_srai_epi16(in2, 5); - in3 = _mm_srai_epi16(in3, 5); - in4 = _mm_srai_epi16(in4, 5); - in5 = _mm_srai_epi16(in5, 5); - in6 = _mm_srai_epi16(in6, 5); - in7 = _mm_srai_epi16(in7, 5); - - RECON_AND_STORE(dest + 0 * stride, in0); - RECON_AND_STORE(dest + 1 * stride, in1); - RECON_AND_STORE(dest + 2 * stride, in2); - RECON_AND_STORE(dest + 3 * stride, in3); - RECON_AND_STORE(dest + 4 * stride, in4); - RECON_AND_STORE(dest + 5 * stride, in5); - RECON_AND_STORE(dest + 6 * stride, in6); - RECON_AND_STORE(dest + 7 * stride, in7); -} + io[0] = load_input_data4(input + 0 * 8); + io[1] = load_input_data4(input + 1 * 8); + io[2] = load_input_data4(input + 2 * 8); + io[3] = load_input_data4(input + 3 * 8); -// Only do addition and subtraction butterfly, size = 16, 32 -static INLINE void add_sub_butterfly(const __m128i *in, __m128i *out, - int size) { - int i = 0; - const int num = size >> 1; - const int bound = size - 1; - while (i < num) { - out[i] = _mm_add_epi16(in[i], in[bound - i]); - out[bound - i] = _mm_sub_epi16(in[i], in[bound - i]); - i++; - } + idct8x8_12_add_kernel_ssse3(io); + write_buffer_8x8(io, dest, stride); } -#define BUTTERFLY_PAIR(x0, x1, co0, co1) \ - do { \ - tmp0 = _mm_madd_epi16(x0, co0); \ - tmp1 = _mm_madd_epi16(x1, co0); \ - tmp2 = _mm_madd_epi16(x0, co1); \ - tmp3 = _mm_madd_epi16(x1, co1); \ - tmp0 = _mm_add_epi32(tmp0, rounding); \ - tmp1 = _mm_add_epi32(tmp1, rounding); \ - tmp2 = _mm_add_epi32(tmp2, rounding); \ - tmp3 = _mm_add_epi32(tmp3, rounding); \ - tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \ - tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \ - tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \ - tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \ - } while (0) - -static INLINE void butterfly(const __m128i *x0, const __m128i *x1, - const __m128i *c0, const __m128i *c1, __m128i *y0, - __m128i *y1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); +// Group the coefficient calculation into smaller functions to prevent stack +// spillover in 32x32 idct optimizations: +// quarter_1: 0-7 +// quarter_2: 8-15 +// quarter_3_4: 16-23, 24-31 - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *y0 = _mm_packs_epi32(tmp0, tmp1); - *y1 = _mm_packs_epi32(tmp2, tmp3); +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + + // stage 5 + step1[0] = step2[0]; + 
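// partial_butterfly_ssse3() above is the core trick of this rewrite:
// _mm_mulhrs_epi16(a, b) returns round(a * b / 2^15) per lane, so loading the
// constant pre-doubled as 2 * cospi_x_64 yields round(a * cospi_x_64 / 2^14),
// exactly the madd/round/shift result of the SSE2 path, in one instruction
// and with no widening to 32 bits. Scalar model of the intrinsic
// (mulhrs_model is a hypothetical name):
static INLINE int16_t mulhrs_model(int16_t a, int16_t b) {
  // PMULHRSW: take bits [29:14] of the 32-bit product, round, halve.
  return (int16_t)(((((int32_t)a * b) >> 14) + 1) >> 1);
}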
step1[1] = step2[0]; + step1[2] = step2[0]; + step1[3] = step2[0]; + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -static INLINE void butterfly_self(__m128i *x0, __m128i *x1, const __m128i *c0, - const __m128i *c1) { - __m128i tmp0, tmp1, tmp2, tmp3, u0, u1; - const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); - - u0 = _mm_unpacklo_epi16(*x0, *x1); - u1 = _mm_unpackhi_epi16(*x0, *x1); - BUTTERFLY_PAIR(u0, u1, *c0, *c1); - *x0 = _mm_packs_epi32(tmp0, tmp1); - *x1 = _mm_packs_epi32(tmp2, tmp3); +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[14] = step2[15]; + step1[15] = step2[15]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -static void idct32_34_first_half(const __m128i *in, __m128i *stp1) { - const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - - const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i x0, x1, x4, x5, x6, x7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - - // phase 1 - - // 0, 15 - u2 = _mm_mulhrs_epi16(in[2], stk2_1); // stp2_15 - u3 = _mm_mulhrs_epi16(in[6], stk2_7); // stp2_12 - v15 = _mm_add_epi16(u2, u3); - // in[0], in[4] - x0 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[0] - x7 = _mm_mulhrs_epi16(in[4], stk3_1); // stp1[7] - v0 = _mm_add_epi16(x0, x7); // stp2_0 - stp1[0] = _mm_add_epi16(v0, v15); - stp1[15] = _mm_sub_epi16(v0, v15); - - // in[2], in[6] - u0 = _mm_mulhrs_epi16(in[2], stk2_0); // stp2_8 - u1 = _mm_mulhrs_epi16(in[6], stk2_6); // stp2_11 - butterfly(&u0, &u2, &stg4_4, &stg4_5, &u4, &u5); // stp2_9, stp2_14 - butterfly(&u1, &u3, &stg4_6, &stg4_4, &u6, &u7); // stp2_10, stp2_13 - - v8 = _mm_add_epi16(u0, u1); - v9 = _mm_add_epi16(u4, 
u6); - v10 = _mm_sub_epi16(u4, u6); - v11 = _mm_sub_epi16(u0, u1); - v12 = _mm_sub_epi16(u2, u3); - v13 = _mm_sub_epi16(u5, u7); - v14 = _mm_add_epi16(u5, u7); - - butterfly_self(&v10, &v13, &stg6_0, &stg4_0); - butterfly_self(&v11, &v12, &stg6_0, &stg4_0); - - // 1, 14 - x1 = _mm_mulhrs_epi16(in[0], stk4_0); // stp1[1], stk4_1 = stk4_0 - // stp1[2] = stp1[0], stp1[3] = stp1[1] - x4 = _mm_mulhrs_epi16(in[4], stk3_0); // stp1[4] - butterfly(&x7, &x4, &stg4_1, &stg4_0, &x5, &x6); - v1 = _mm_add_epi16(x1, x6); // stp2_1 - v2 = _mm_add_epi16(x0, x5); // stp2_2 - stp1[1] = _mm_add_epi16(v1, v14); - stp1[14] = _mm_sub_epi16(v1, v14); - - stp1[2] = _mm_add_epi16(v2, v13); - stp1[13] = _mm_sub_epi16(v2, v13); - - v3 = _mm_add_epi16(x1, x4); // stp2_3 - v4 = _mm_sub_epi16(x1, x4); // stp2_4 - - v5 = _mm_sub_epi16(x0, x5); // stp2_5 - - v6 = _mm_sub_epi16(x1, x6); // stp2_6 - v7 = _mm_sub_epi16(x0, x7); // stp2_7 - stp1[3] = _mm_add_epi16(v3, v12); - stp1[12] = _mm_sub_epi16(v3, v12); - - stp1[6] = _mm_add_epi16(v6, v9); - stp1[9] = _mm_sub_epi16(v6, v9); - - stp1[7] = _mm_add_epi16(v7, v8); - stp1[8] = _mm_sub_epi16(v7, v8); - - stp1[4] = _mm_add_epi16(v4, v11); - stp1[11] = _mm_sub_epi16(v4, v11); - - stp1[5] = _mm_add_epi16(v5, v10); - stp1[10] = _mm_sub_epi16(v5, v10); +static INLINE void idct32_34_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i temp[16]; + idct32_34_8x32_quarter_1(in, temp); + idct32_34_8x32_quarter_2(in, temp); + // stage 7 + add_sub_butterfly(temp, out, 16); } -static void idct32_34_second_half(const __m128i *in, __m128i *stp1) { - const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - v16 = _mm_mulhrs_epi16(in[1], stk1_0); - v31 = _mm_mulhrs_epi16(in[1], stk1_1); - - v19 = _mm_mulhrs_epi16(in[7], stk1_6); - v28 = _mm_mulhrs_epi16(in[7], stk1_7); - - v20 = _mm_mulhrs_epi16(in[5], stk1_8); - v27 = _mm_mulhrs_epi16(in[5], stk1_9); - - v23 = _mm_mulhrs_epi16(in[3], stk1_14); - v24 = _mm_mulhrs_epi16(in[3], stk1_15); - - butterfly(&v16, &v31, &stg3_4, &stg3_5, &v17, 
&v30); - butterfly(&v19, &v28, &stg3_6, &stg3_4, &v18, &v29); - butterfly(&v20, &v27, &stg3_8, &stg3_9, &v21, &v26); - butterfly(&v23, &v24, &stg3_10, &stg3_8, &v22, &v25); - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - u24 = _mm_add_epi16(v24, v27); - u27 = _mm_sub_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u28 = _mm_sub_epi16(v31, v28); - u31 = _mm_add_epi16(v28, v31); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - - stp1[16] = _mm_add_epi16(u16, u23); - stp1[23] = _mm_sub_epi16(u16, u23); - - stp1[17] = _mm_add_epi16(u17, u22); - stp1[22] = _mm_sub_epi16(u17, u22); - - stp1[18] = _mm_add_epi16(u18, u21); - stp1[21] = _mm_sub_epi16(u18, u21); - - stp1[19] = _mm_add_epi16(u19, u20); - stp1[20] = _mm_sub_epi16(u19, u20); - - stp1[24] = _mm_sub_epi16(u31, u24); - stp1[31] = _mm_add_epi16(u24, u31); - - stp1[25] = _mm_sub_epi16(u30, u25); - stp1[30] = _mm_add_epi16(u25, u30); - - stp1[26] = _mm_sub_epi16(u29, u26); - stp1[29] = _mm_add_epi16(u26, u29); +// For each 8x32 block __m128i in[32], +// Input with odd index, 1, 3, 5, 7 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_34_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 3 + butterfly(step1[31], step1[16], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step1[28], step1[19], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + butterfly(step1[27], step1[20], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step1[24], step1[23], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); +} - stp1[27] = _mm_sub_epi16(u28, u27); - stp1[28] = _mm_add_epi16(u27, u28); +void idct32_34_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { + __m128i temp[32]; - butterfly_self(&stp1[20], &stp1[27], &stg6_0, &stg4_0); - butterfly_self(&stp1[21], &stp1[26], &stg6_0, &stg4_0); - butterfly_self(&stp1[22], &stp1[25], &stg6_0, &stg4_0); - butterfly_self(&stp1[23], &stp1[24], &stg6_0, &stg4_0); + idct32_34_8x32_quarter_1_2(in, temp); + idct32_34_8x32_quarter_3_4(in, temp); + // final stage + add_sub_butterfly(temp, out, 32); } // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride) { - const __m128i zero = _mm_setzero_si128(); - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - __m128i in[32], col[32]; - __m128i stp1[32]; + __m128i io[32], col[32]; int i; // Load input data. Only need to load the top left 8x8 block. 
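// The _34 suffix means the scan-ordered eob is at most 34, which confines the
// non-zero coefficients to the top-left 8x8 corner of the 32x32 block; all
// other inputs are treated as zero, so a single strided 8x8 load suffices.
// Scalar model of that load (a sketch; the load_transpose_16bit_8x8() call in
// the replacement code also transposes the tile while loading):
static INLINE void load_top_left_8x8_model(const tran_low_t *input,
                                           int16_t tile[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) tile[r][c] = (int16_t)input[r * 32 + c];
}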
- in[0] = load_input_data(input); - in[1] = load_input_data(input + 32); - in[2] = load_input_data(input + 64); - in[3] = load_input_data(input + 96); - in[4] = load_input_data(input + 128); - in[5] = load_input_data(input + 160); - in[6] = load_input_data(input + 192); - in[7] = load_input_data(input + 224); + load_transpose_16bit_8x8(input, 32, io); + idct32_34_8x32_ssse3(io, col); - array_transpose_8x8(in, in); - idct32_34_first_half(in, stp1); - idct32_34_second_half(in, stp1); - - // 1_D: Store 32 intermediate results for each 8x32 block. - add_sub_butterfly(stp1, col, 32); - for (i = 0; i < 4; i++) { + for (i = 0; i < 32; i += 8) { int j; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + i * 8, in); - idct32_34_first_half(in, stp1); - idct32_34_second_half(in, stp1); + transpose_16bit_8x8(col + i, io); + idct32_34_8x32_ssse3(io, io); - // 2_D: Calculate the results and store them to destination. - add_sub_butterfly(stp1, in, 32); for (j = 0; j < 32; ++j) { - // Final rounding and shift - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j] = _mm_srai_epi16(in[j], 6); - RECON_AND_STORE(dest + j * stride, in[j]); + write_buffer_8x1(dest + j * stride, io[j]); } dest += 8; } } -// in0[16] represents the left 8x16 block -// in1[16] represents the right 8x16 block -static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, - __m128i *in1) { - int i; - for (i = 0; i < 16; i++) { - in0[i] = load_input_data(input); - in1[i] = load_input_data(input + 8); - input += 32; - } -} - -static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, - __m128i *out1) { - array_transpose_8x8(in0, out0); - array_transpose_8x8(&in0[8], out1); - array_transpose_8x8(in1, &out0[8]); - array_transpose_8x8(&in1[8], &out1[8]); -} - -// Group the coefficient calculation into smaller functions -// to prevent stack spillover: -// quarter_1: 0-7 -// quarter_2: 8-15 -// quarter_3_4: 16-23, 24-31 -static void idct32_8x32_135_quarter_1(const __m128i *in /*in[16]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - - { - const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); - const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); - const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); - u0 = _mm_mulhrs_epi16(in[0], stk4_0); - u2 = _mm_mulhrs_epi16(in[8], stk4_2); - u3 = _mm_mulhrs_epi16(in[8], stk4_3); - u1 = u0; - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - { - const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); - const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); - const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); - const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); - u4 = _mm_mulhrs_epi16(in[4], stk3_0); - u7 = _mm_mulhrs_epi16(in[4], stk3_1); - u5 = _mm_mulhrs_epi16(in[12], stk3_2); - u6 = _mm_mulhrs_epi16(in[12], stk3_3); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - } - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - out[3] = _mm_add_epi16(v3, v4); - out[4] = 
_mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = _mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); +// For each 8x32 block __m128i in[32], +// Input with index, 0, 4, 8, 12 +// output pixels: 0-7 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_1(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[8]*/) { + __m128i step1[8], step2[8]; + + // stage 3 + partial_butterfly_ssse3(in[4], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + partial_butterfly_ssse3(in[12], -cospi_20_64, cospi_12_64, &step1[5], + &step1[6]); + + // stage 4 + step2[0] = partial_butterfly_cospi16_ssse3(in[0]); + partial_butterfly_ssse3(in[8], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm_add_epi16(step1[0], step1[7]); + out[1] = _mm_add_epi16(step1[1], step1[6]); + out[2] = _mm_add_epi16(step1[2], step1[5]); + out[3] = _mm_add_epi16(step1[3], step1[4]); + out[4] = _mm_sub_epi16(step1[3], step1[4]); + out[5] = _mm_sub_epi16(step1[2], step1[5]); + out[6] = _mm_sub_epi16(step1[1], step1[6]); + out[7] = _mm_sub_epi16(step1[0], step1[7]); } -static void idct32_8x32_135_quarter_2(const __m128i *in /*in[16]*/, - __m128i *out /*out[8]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v8, v9, v10, v11, v12, v13, v14, v15; - - { - const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); - const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); - const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); - const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); - const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); - const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); - const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); - const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); - u8 = _mm_mulhrs_epi16(in[2], stk2_0); - u15 = _mm_mulhrs_epi16(in[2], stk2_1); - u9 = _mm_mulhrs_epi16(in[14], stk2_2); - u14 = _mm_mulhrs_epi16(in[14], stk2_3); - u10 = _mm_mulhrs_epi16(in[10], stk2_4); - u13 = _mm_mulhrs_epi16(in[10], stk2_5); - u11 = _mm_mulhrs_epi16(in[6], stk2_6); - u12 = _mm_mulhrs_epi16(in[6], stk2_7); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[2] = _mm_sub_epi16(v9, v10); - out[3] = _mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - out[6] = 
_mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } +// For each 8x32 block __m128i in[32], +// Input with index, 2, 6, 10, 14 +// output pixels: 8-15 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_2(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[16]*/) { + __m128i step1[16], step2[16]; + + // stage 2 + partial_butterfly_ssse3(in[2], cospi_30_64, cospi_2_64, &step2[8], + &step2[15]); + partial_butterfly_ssse3(in[14], -cospi_18_64, cospi_14_64, &step2[9], + &step2[14]); + partial_butterfly_ssse3(in[10], cospi_22_64, cospi_10_64, &step2[10], + &step2[13]); + partial_butterfly_ssse3(in[6], -cospi_26_64, cospi_6_64, &step2[11], + &step2[12]); + + // stage 3 + step1[8] = _mm_add_epi16(step2[8], step2[9]); + step1[9] = _mm_sub_epi16(step2[8], step2[9]); + step1[10] = _mm_sub_epi16(step2[11], step2[10]); + step1[11] = _mm_add_epi16(step2[11], step2[10]); + step1[12] = _mm_add_epi16(step2[12], step2[13]); + step1[13] = _mm_sub_epi16(step2[12], step2[13]); + step1[14] = _mm_sub_epi16(step2[15], step2[14]); + step1[15] = _mm_add_epi16(step2[15], step2[14]); + + idct32_8x32_quarter_2_stage_4_to_6(step1, out); } -// 8x32 block even indexed 8 inputs of in[16], -// output first half 16 to out[32] -static void idct32_8x32_quarter_1_2(const __m128i *in /*in[16]*/, - __m128i *out /*out[32]*/) { +static INLINE void idct32_135_8x32_quarter_1_2( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { __m128i temp[16]; - idct32_8x32_135_quarter_1(in, temp); - idct32_8x32_135_quarter_2(in, &temp[8]); + idct32_135_8x32_quarter_1(in, temp); + idct32_135_8x32_quarter_2(in, temp); + // stage 7 add_sub_butterfly(temp, out, 16); } -// 8x32 block odd indexed 8 inputs of in[16], -// output second half 16 to out[32] -static void idct32_8x32_quarter_3_4(const __m128i *in /*in[16]*/, - __m128i *out /*out[32]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); - const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); - const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); - const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); - - const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); - const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); - const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); - const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); - const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); - const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); - const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); - const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); - - const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); - const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); - const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); - const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); - u16 = 
_mm_mulhrs_epi16(in[1], stk1_0); - u31 = _mm_mulhrs_epi16(in[1], stk1_1); - u17 = _mm_mulhrs_epi16(in[15], stk1_2); - u30 = _mm_mulhrs_epi16(in[15], stk1_3); - - u18 = _mm_mulhrs_epi16(in[9], stk1_4); - u29 = _mm_mulhrs_epi16(in[9], stk1_5); - u19 = _mm_mulhrs_epi16(in[7], stk1_6); - u28 = _mm_mulhrs_epi16(in[7], stk1_7); - - u20 = _mm_mulhrs_epi16(in[5], stk1_8); - u27 = _mm_mulhrs_epi16(in[5], stk1_9); - u21 = _mm_mulhrs_epi16(in[11], stk1_10); - u26 = _mm_mulhrs_epi16(in[11], stk1_11); - - u22 = _mm_mulhrs_epi16(in[13], stk1_12); - u25 = _mm_mulhrs_epi16(in[13], stk1_13); - u23 = _mm_mulhrs_epi16(in[3], stk1_14); - u24 = _mm_mulhrs_epi16(in[3], stk1_15); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - v20 = _mm_sub_epi16(u19, u20); - v21 = _mm_sub_epi16(u18, u21); - v22 = _mm_sub_epi16(u17, u22); - v23 = _mm_sub_epi16(u16, u23); - - v24 = _mm_sub_epi16(u31, u24); - v25 = _mm_sub_epi16(u30, u25); - v26 = _mm_sub_epi16(u29, u26); - v27 = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly(&v20, &v27, &stg6_0, &stg4_0, &out[4], &out[11]); - butterfly(&v21, &v26, &stg6_0, 
&stg4_0, &out[5], &out[10]); - butterfly(&v22, &v25, &stg6_0, &stg4_0, &out[6], &out[9]); - butterfly(&v23, &v24, &stg6_0, &stg4_0, &out[7], &out[8]); - } -} - -// 8x16 block, input __m128i in[16], output __m128i in[32] -static void idct32_8x32_135(__m128i *in /*in[32]*/) { - __m128i out[32]; - idct32_8x32_quarter_1_2(in, out); - idct32_8x32_quarter_3_4(in, &out[16]); - add_sub_butterfly(out, in, 32); -} - -static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { - const __m128i final_rounding = _mm_set1_epi16(1 << 5); - const __m128i zero = _mm_setzero_si128(); - int j = 0; - while (j < 32) { - in[j] = _mm_adds_epi16(in[j], final_rounding); - in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); - - in[j] = _mm_srai_epi16(in[j], 6); - in[j + 1] = _mm_srai_epi16(in[j + 1], 6); - - RECON_AND_STORE(dst, in[j]); - dst += stride; - RECON_AND_STORE(dst, in[j + 1]); - dst += stride; - j += 2; - } -} - -static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest, - int stride) { - store_buffer_8x32(in0, dest, stride); - store_buffer_8x32(in1, dest + 8, stride); -} - -static INLINE void idct32_135(__m128i *col0, __m128i *col1) { - idct32_8x32_135(col0); - idct32_8x32_135(col1); -} - -typedef enum { left_16, right_16 } ColsIndicator; - -static void transpose_and_copy_16x16(__m128i *in0, __m128i *in1, __m128i *store, - ColsIndicator cols) { - switch (cols) { - case left_16: { - int i; - array_transpose_16x16(in0, in1); - for (i = 0; i < 16; ++i) { - store[i] = in0[16 + i]; - store[16 + i] = in1[16 + i]; - } - break; - } - case right_16: { - array_transpose_16x16_2(store, &store[16], in0, in1); - break; - } - default: { assert(0); } - } -} - -// Only upper-left 16x16 has non-zero coeff -void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - // Each array represents an 8x32 block - __m128i col0[32], col1[32]; - // This array represents a 16x16 block - __m128i temp[32]; - - // Load input data. Only need to load the top left 16x16 block. 
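// The _135 variant assumes eob <= 135, i.e. non-zero coefficients only in the
// upper-left 16x16 quadrant of the 32x32 block. After the column pass,
// store_buffer_8x32() applies the final rounding by 1 << 5, shifts right by 6,
// and adds the residual to the predictor with unsigned saturation
// (RECON_AND_STORE). Scalar model of one reconstructed pixel
// (recon_pixel_model is a hypothetical name):
static INLINE uint8_t recon_pixel_model(uint8_t pred, int16_t residual) {
  const int v = pred + ((residual + 32) >> 6);         // round, then >> 6
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));   // saturate to 8 bits
}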
- load_buffer_16x16(input, col0, col1); - - // columns - array_transpose_16x16(col0, col1); - idct32_135(col0, col1); - - // rows - transpose_and_copy_16x16(col0, col1, temp, left_16); - idct32_135(col0, col1); - recon_and_store(col0, col1, dest, stride); - - transpose_and_copy_16x16(col0, col1, temp, right_16); - idct32_135(col0, col1); - recon_and_store(col0, col1, dest + 16, stride); -} - -// For each 8x32 block __m128i in[32], -// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 -// output pixels: 8-15 in __m128i in[32] -static void idct32_full_8x32_quarter_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i u8, u9, u10, u11, u12, u13, u14, u15; // stp2_ - __m128i v8, v9, v10, v11, v12, v13, v14, v15; // stp1_ - - { - const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); - butterfly(&in[2], &in[30], &stg2_0, &stg2_1, &u8, &u15); - butterfly(&in[18], &in[14], &stg2_2, &stg2_3, &u9, &u14); - } - - v8 = _mm_add_epi16(u8, u9); - v9 = _mm_sub_epi16(u8, u9); - v14 = _mm_sub_epi16(u15, u14); - v15 = _mm_add_epi16(u15, u14); - - { - const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); - butterfly(&in[10], &in[22], &stg2_4, &stg2_5, &u10, &u13); - butterfly(&in[26], &in[6], &stg2_6, &stg2_7, &u11, &u12); - } - - v10 = _mm_sub_epi16(u11, u10); - v11 = _mm_add_epi16(u11, u10); - v12 = _mm_add_epi16(u12, u13); - v13 = _mm_sub_epi16(u12, u13); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&v9, &v14, &stg4_4, &stg4_5); - butterfly_self(&v10, &v13, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(v8, v11); - out[1] = _mm_add_epi16(v9, v10); - out[6] = _mm_add_epi16(v14, v13); - out[7] = _mm_add_epi16(v15, v12); - - out[2] = _mm_sub_epi16(v9, v10); - out[3] = _mm_sub_epi16(v8, v11); - out[4] = _mm_sub_epi16(v15, v12); - out[5] = _mm_sub_epi16(v14, v13); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[2], &out[5], &stg6_0, &stg4_0); - butterfly_self(&out[3], &out[4], &stg6_0, &stg4_0); - } -} - -// For each 8x32 block __m128i in[32], -// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 -// output pixels: 0-7 in __m128i in[32] -static void idct32_full_8x32_quarter_1(const __m128i *in /*in[32]*/, - __m128i *out /*out[8]*/) { - __m128i u0, u1, u2, u3, u4, u5, u6, u7; // stp1_ - __m128i v0, v1, v2, v3, v4, v5, v6, v7; // stp2_ - - { - const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); - butterfly(&in[4], &in[28], &stg3_0, &stg3_1, &u4, &u7); - butterfly(&in[20], &in[12], &stg3_2, &stg3_3, &u5, &u6); - } - - v4 = _mm_add_epi16(u4, u5); - v5 = _mm_sub_epi16(u4, u5); - v6 = _mm_sub_epi16(u7, u6); - v7 = _mm_add_epi16(u7, u6); - - { - const __m128i stg4_0 = 
pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); - butterfly(&v6, &v5, &stg4_1, &stg4_0, &v5, &v6); - - butterfly(&in[0], &in[16], &stg4_0, &stg4_1, &u0, &u1); - butterfly(&in[8], &in[24], &stg4_2, &stg4_3, &u2, &u3); - } - - v0 = _mm_add_epi16(u0, u3); - v1 = _mm_add_epi16(u1, u2); - v2 = _mm_sub_epi16(u1, u2); - v3 = _mm_sub_epi16(u0, u3); - - out[0] = _mm_add_epi16(v0, v7); - out[1] = _mm_add_epi16(v1, v6); - out[2] = _mm_add_epi16(v2, v5); - out[3] = _mm_add_epi16(v3, v4); - out[4] = _mm_sub_epi16(v3, v4); - out[5] = _mm_sub_epi16(v2, v5); - out[6] = _mm_sub_epi16(v1, v6); - out[7] = _mm_sub_epi16(v0, v7); -} - // For each 8x32 block __m128i in[32], // Input with odd index, -// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 -// output pixels: 16-23, 24-31 in __m128i in[32] -// We avoid hide an offset, 16, inside this function. So we output 0-15 into -// array out[16] -static void idct32_full_8x32_quarter_3_4(const __m128i *in /*in[32]*/, - __m128i *out /*out[16]*/) { - __m128i v16, v17, v18, v19, v20, v21, v22, v23; - __m128i v24, v25, v26, v27, v28, v29, v30, v31; - __m128i u16, u17, u18, u19, u20, u21, u22, u23; - __m128i u24, u25, u26, u27, u28, u29, u30, u31; - - { - const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); - butterfly(&in[1], &in[31], &stg1_0, &stg1_1, &u16, &u31); - butterfly(&in[17], &in[15], &stg1_2, &stg1_3, &u17, &u30); - butterfly(&in[9], &in[23], &stg1_4, &stg1_5, &u18, &u29); - butterfly(&in[25], &in[7], &stg1_6, &stg1_7, &u19, &u28); - - butterfly(&in[5], &in[27], &stg1_8, &stg1_9, &u20, &u27); - butterfly(&in[21], &in[11], &stg1_10, &stg1_11, &u21, &u26); - - butterfly(&in[13], &in[19], &stg1_12, &stg1_13, &u22, &u25); - butterfly(&in[29], &in[3], &stg1_14, &stg1_15, &u23, &u24); - } - - v16 = _mm_add_epi16(u16, u17); - v17 = _mm_sub_epi16(u16, u17); - v18 = _mm_sub_epi16(u19, u18); - v19 = _mm_add_epi16(u19, u18); - - v20 = _mm_add_epi16(u20, u21); - v21 = _mm_sub_epi16(u20, u21); - v22 = _mm_sub_epi16(u23, u22); - v23 = _mm_add_epi16(u23, u22); - - v24 = _mm_add_epi16(u24, u25); - v25 = _mm_sub_epi16(u24, u25); - v26 = _mm_sub_epi16(u27, u26); - v27 = _mm_add_epi16(u27, u26); - - v28 = _mm_add_epi16(u28, u29); - v29 = _mm_sub_epi16(u28, u29); - v30 = _mm_sub_epi16(u31, u30); - v31 = _mm_add_epi16(u31, u30); - - { - const __m128i stg3_4 = 
pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); - const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); - butterfly_self(&v17, &v30, &stg3_4, &stg3_5); - butterfly_self(&v18, &v29, &stg3_6, &stg3_4); - butterfly_self(&v21, &v26, &stg3_8, &stg3_9); - butterfly_self(&v22, &v25, &stg3_10, &stg3_8); - } - - u16 = _mm_add_epi16(v16, v19); - u17 = _mm_add_epi16(v17, v18); - u18 = _mm_sub_epi16(v17, v18); - u19 = _mm_sub_epi16(v16, v19); - u20 = _mm_sub_epi16(v23, v20); - u21 = _mm_sub_epi16(v22, v21); - u22 = _mm_add_epi16(v22, v21); - u23 = _mm_add_epi16(v23, v20); - - u24 = _mm_add_epi16(v24, v27); - u25 = _mm_add_epi16(v25, v26); - u26 = _mm_sub_epi16(v25, v26); - u27 = _mm_sub_epi16(v24, v27); - - u28 = _mm_sub_epi16(v31, v28); - u29 = _mm_sub_epi16(v30, v29); - u30 = _mm_add_epi16(v29, v30); - u31 = _mm_add_epi16(v28, v31); - - { - const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - butterfly_self(&u18, &u29, &stg4_4, &stg4_5); - butterfly_self(&u19, &u28, &stg4_4, &stg4_5); - butterfly_self(&u20, &u27, &stg4_6, &stg4_4); - butterfly_self(&u21, &u26, &stg4_6, &stg4_4); - } - - out[0] = _mm_add_epi16(u16, u23); - out[1] = _mm_add_epi16(u17, u22); - out[2] = _mm_add_epi16(u18, u21); - out[3] = _mm_add_epi16(u19, u20); - out[4] = _mm_sub_epi16(u19, u20); - out[5] = _mm_sub_epi16(u18, u21); - out[6] = _mm_sub_epi16(u17, u22); - out[7] = _mm_sub_epi16(u16, u23); - - out[8] = _mm_sub_epi16(u31, u24); - out[9] = _mm_sub_epi16(u30, u25); - out[10] = _mm_sub_epi16(u29, u26); - out[11] = _mm_sub_epi16(u28, u27); - out[12] = _mm_add_epi16(u27, u28); - out[13] = _mm_add_epi16(u26, u29); - out[14] = _mm_add_epi16(u25, u30); - out[15] = _mm_add_epi16(u24, u31); - - { - const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); - const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); - butterfly_self(&out[4], &out[11], &stg6_0, &stg4_0); - butterfly_self(&out[5], &out[10], &stg6_0, &stg4_0); - butterfly_self(&out[6], &out[9], &stg6_0, &stg4_0); - butterfly_self(&out[7], &out[8], &stg6_0, &stg4_0); - } -} - -static void idct32_full_8x32_quarter_1_2(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { - __m128i temp[16]; - idct32_full_8x32_quarter_1(in, temp); - idct32_full_8x32_quarter_2(in, &temp[8]); - add_sub_butterfly(temp, out, 16); +// 1, 3, 5, 7, 9, 11, 13, 15 +// output pixels: 16-23, 24-31 in __m128i out[32] +static INLINE void idct32_135_8x32_quarter_3_4( + const __m128i *const in /*in[32]*/, __m128i *const out /*out[32]*/) { + __m128i step1[32], step2[32]; + + // stage 1 + partial_butterfly_ssse3(in[1], cospi_31_64, cospi_1_64, &step1[16], + &step1[31]); + partial_butterfly_ssse3(in[15], -cospi_17_64, cospi_15_64, &step1[17], + &step1[30]); + partial_butterfly_ssse3(in[9], cospi_23_64, cospi_9_64, &step1[18], + &step1[29]); + partial_butterfly_ssse3(in[7], -cospi_25_64, cospi_7_64, &step1[19], + &step1[28]); + + partial_butterfly_ssse3(in[5], cospi_27_64, cospi_5_64, &step1[20], + &step1[27]); + partial_butterfly_ssse3(in[11], -cospi_21_64, cospi_11_64, &step1[21], + &step1[26]); + + partial_butterfly_ssse3(in[13], cospi_19_64, cospi_13_64, &step1[22], + 
&step1[25]); + partial_butterfly_ssse3(in[3], -cospi_29_64, cospi_3_64, &step1[23], + &step1[24]); + + // stage 2 + step2[16] = _mm_add_epi16(step1[16], step1[17]); + step2[17] = _mm_sub_epi16(step1[16], step1[17]); + step2[18] = _mm_sub_epi16(step1[19], step1[18]); + step2[19] = _mm_add_epi16(step1[19], step1[18]); + step2[20] = _mm_add_epi16(step1[20], step1[21]); + step2[21] = _mm_sub_epi16(step1[20], step1[21]); + step2[22] = _mm_sub_epi16(step1[23], step1[22]); + step2[23] = _mm_add_epi16(step1[23], step1[22]); + + step2[24] = _mm_add_epi16(step1[24], step1[25]); + step2[25] = _mm_sub_epi16(step1[24], step1[25]); + step2[26] = _mm_sub_epi16(step1[27], step1[26]); + step2[27] = _mm_add_epi16(step1[27], step1[26]); + step2[28] = _mm_add_epi16(step1[28], step1[29]); + step2[29] = _mm_sub_epi16(step1[28], step1[29]); + step2[30] = _mm_sub_epi16(step1[31], step1[30]); + step2[31] = _mm_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_8x32_quarter_3_4_stage_4_to_7(step1, out); } -static void idct32_full_8x32(const __m128i *in /*in[32]*/, - __m128i *out /*out[32]*/) { +void idct32_135_8x32_ssse3(const __m128i *const in /*in[32]*/, + __m128i *const out /*out[32]*/) { __m128i temp[32]; - idct32_full_8x32_quarter_1_2(in, temp); - idct32_full_8x32_quarter_3_4(in, &temp[16]); + idct32_135_8x32_quarter_1_2(in, temp); + idct32_135_8x32_quarter_3_4(in, temp); + // final stage add_sub_butterfly(temp, out, 32); } -static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[2][32], io[32]; int i; - for (i = 0; i < 8; ++i) { - in[i] = load_input_data(input); - in[i + 8] = load_input_data(input + 8); - in[i + 16] = load_input_data(input + 16); - in[i + 24] = load_input_data(input + 24); - input += 32; - } -} - -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, - int stride) { - __m128i col[128], in[32]; - int i, j; // rows - for (i = 0; i < 4; ++i) { - load_buffer_8x32(input, in); + for (i = 0; i < 2; i++) { + load_transpose_16bit_8x8(&input[0], 32, &io[0]); + load_transpose_16bit_8x8(&input[8], 32, &io[8]); + idct32_135_8x32_ssse3(io, col[i]); input += 32 << 3; - - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(in, in); - array_transpose_8x8(in + 8, in + 8); - array_transpose_8x8(in + 16, in + 16); - array_transpose_8x8(in + 24, in + 24); - - idct32_full_8x32(in, col + (i << 5)); } // columns - for (i = 0; i < 4; ++i) { - j = i << 3; - // Transpose 32x8 block to 8x32 block - array_transpose_8x8(col + j, in); - array_transpose_8x8(col + j + 32, in + 8); - array_transpose_8x8(col + j + 64, in + 16); - array_transpose_8x8(col + j + 96, in + 24); - - idct32_full_8x32(in, in); - store_buffer_8x32(in, dest, stride); + for (i = 0; i < 32; i += 8) { + transpose_16bit_8x8(col[0] + i, io); + transpose_16bit_8x8(col[1] + i, io + 8); + idct32_135_8x32_ssse3(io, io); + store_buffer_8x32(io, dest, stride); dest += 8; } } 
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h new file mode 100644 index 000000000..e785c8eda --- /dev/null +++ b/libvpx/vpx_dsp/x86/inv_txfm_ssse3.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_X86_INV_TXFM_SSSE3_H_ +#define VPX_DSP_X86_INV_TXFM_SSSE3_H_ + +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +static INLINE void idct8x8_12_add_kernel_ssse3(__m128i *const io /* io[8] */) { + const __m128i cp_28d_4d = dual_set_epi16(2 * cospi_28_64, 2 * cospi_4_64); + const __m128i cp_n20d_12d = dual_set_epi16(-2 * cospi_20_64, 2 * cospi_12_64); + const __m128i cp_8d_24d = dual_set_epi16(2 * cospi_8_64, 2 * cospi_24_64); + const __m128i cp_16_16 = _mm_set1_epi16(cospi_16_64); + const __m128i cp_16_n16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i cospi_16_64d = _mm_set1_epi16((int16_t)(2 * cospi_16_64)); + const __m128i cospi_28_64d = _mm_set1_epi16((int16_t)(2 * cospi_28_64)); + const __m128i cospi_4_64d = _mm_set1_epi16((int16_t)(2 * cospi_4_64)); + const __m128i cospi_n20_64d = _mm_set1_epi16((int16_t)(-2 * cospi_20_64)); + const __m128i cospi_12_64d = _mm_set1_epi16((int16_t)(2 * cospi_12_64)); + const __m128i cospi_24_64d = _mm_set1_epi16((int16_t)(2 * cospi_24_64)); + const __m128i cospi_8_64d = _mm_set1_epi16((int16_t)(2 * cospi_8_64)); + __m128i step1[8], step2[8], tmp[4]; + + // pass 1 + + transpose_16bit_4x4(io, io); + // io[0]: 00 10 20 30 01 11 21 31 + // io[1]: 02 12 22 32 03 13 23 33 + + // stage 1 + tmp[0] = _mm_unpacklo_epi64(io[0], io[0]); + tmp[1] = _mm_unpackhi_epi64(io[0], io[0]); + tmp[2] = _mm_unpacklo_epi64(io[1], io[1]); + tmp[3] = _mm_unpackhi_epi64(io[1], io[1]); + step1[4] = _mm_mulhrs_epi16(tmp[1], cp_28d_4d); // step1 4&7 + step1[5] = _mm_mulhrs_epi16(tmp[3], cp_n20d_12d); // step1 5&6 + + // stage 2 + step2[0] = _mm_mulhrs_epi16(tmp[0], cospi_16_64d); // step2 0&1 + step2[2] = _mm_mulhrs_epi16(tmp[2], cp_8d_24d); // step2 3&2 + step2[4] = _mm_add_epi16(step1[4], step1[5]); // step2 4&7 + step2[5] = _mm_sub_epi16(step1[4], step1[5]); // step2 5&6 + step2[6] = _mm_unpackhi_epi64(step2[5], step2[5]); // step2 6 + + // stage 3 + tmp[0] = _mm_unpacklo_epi16(step2[6], step2[5]); + step1[5] = idct_calc_wraplow_sse2(cp_16_n16, cp_16_16, tmp[0]); // step1 5&6 + tmp[0] = _mm_add_epi16(step2[0], step2[2]); // step1 0&1 + tmp[1] = _mm_sub_epi16(step2[0], step2[2]); // step1 3&2 + step1[2] = _mm_unpackhi_epi64(tmp[1], tmp[0]); // step1 2&1 + step1[3] = _mm_unpacklo_epi64(tmp[1], tmp[0]); // step1 3&0 + + // stage 4 + tmp[0] = _mm_add_epi16(step1[3], step2[4]); // output 3&0 + tmp[1] = _mm_add_epi16(step1[2], step1[5]); // output 2&1 + tmp[2] = _mm_sub_epi16(step1[3], step2[4]); // output 4&7 + tmp[3] = _mm_sub_epi16(step1[2], step1[5]); // output 5&6 + + // pass 2 + + idct8x8_12_transpose_16bit_4x8(tmp, io); + + // stage 1 + step1[4] = _mm_mulhrs_epi16(io[1], cospi_28_64d); + step1[7] = _mm_mulhrs_epi16(io[1], cospi_4_64d); + step1[5] = _mm_mulhrs_epi16(io[3], cospi_n20_64d); + step1[6] = 
_mm_mulhrs_epi16(io[3], cospi_12_64d); + + // stage 2 + step2[0] = _mm_mulhrs_epi16(io[0], cospi_16_64d); // step2[1] = step2[0] + step2[2] = _mm_mulhrs_epi16(io[2], cospi_24_64d); + step2[3] = _mm_mulhrs_epi16(io[2], cospi_8_64d); + step2[4] = _mm_add_epi16(step1[4], step1[5]); + step2[5] = _mm_sub_epi16(step1[4], step1[5]); + step2[6] = _mm_sub_epi16(step1[7], step1[6]); + step2[7] = _mm_add_epi16(step1[7], step1[6]); + + // stage 3 + step1[0] = _mm_add_epi16(step2[0], step2[3]); + step1[1] = _mm_add_epi16(step2[0], step2[2]); + step1[2] = _mm_sub_epi16(step2[0], step2[2]); + step1[3] = _mm_sub_epi16(step2[0], step2[3]); + butterfly(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], &step1[6]); + + // stage 4 + io[0] = _mm_add_epi16(step1[0], step2[7]); + io[1] = _mm_add_epi16(step1[1], step1[6]); + io[2] = _mm_add_epi16(step1[2], step1[5]); + io[3] = _mm_add_epi16(step1[3], step2[4]); + io[4] = _mm_sub_epi16(step1[3], step2[4]); + io[5] = _mm_sub_epi16(step1[2], step1[5]); + io[6] = _mm_sub_epi16(step1[1], step1[6]); + io[7] = _mm_sub_epi16(step1[0], step2[7]); +} + +void idct32_135_8x32_ssse3(const __m128i *const in, __m128i *const out); + +#endif // VPX_DSP_X86_INV_TXFM_SSSE3_H_ diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h new file mode 100644 index 000000000..2ce738fb7 --- /dev/null +++ b/libvpx/vpx_dsp/x86/mem_sse2.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_DSP_X86_MEM_SSE2_H_ +#define VPX_DSP_X86_MEM_SSE2_H_ + +#include <emmintrin.h> // SSE2 + +#include "./vpx_config.h" + +static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); + d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); + d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); + d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); +} + +static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_4x4(s + 0 * stride, stride, &d[0]); + load_8bit_4x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); +} + +static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, + __m128i *const d) { + load_8bit_8x4(s + 0 * stride, stride, &d[0]); + load_8bit_8x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void load_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); + d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); + d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); + d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); + d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); +} + +static INLINE void loadu_8bit_16x4(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); + d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); + d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); + d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); +} + +static INLINE void loadu_8bit_16x8(const uint8_t *const s, + const ptrdiff_t stride, __m128i *const d) { + loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); + loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); +} + +static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) { + _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); +} + +static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); + *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); + *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); + *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); +} + +static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, + const ptrdiff_t stride) { + __m128i ss[4]; + + ss[0] = s; + ss[1] = _mm_srli_si128(s, 4); + ss[2] = _mm_srli_si128(s, 8); + ss[3] = _mm_srli_si128(s, 12); + store_8bit_4x4(ss, d, stride); +} + +static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s, + uint8_t *const d, + const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); + _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); +} + +static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d, + 
const ptrdiff_t stride) { + _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); + _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); + _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); + _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); + _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); + _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); + _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); + _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); +} + +static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, + const ptrdiff_t stride) { + _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); + _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); + _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); + _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); +} + +#endif // VPX_DSP_X86_MEM_SSE2_H_ diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c new file mode 100644 index 000000000..6f4489004 --- /dev/null +++ b/libvpx/vpx_dsp/x86/quantize_avx.c @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#if defined(_MSC_VER) +#include <intrin.h> +#endif +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + *eob_ptr = 0; + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. 
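+ // Lane 0 of zbin/round/quant/shift holds the DC constant and the upper + // lanes hold the AC constant, so this first group of 16 runs with mixed + // DC/AC values before each vector is collapsed to its AC half below.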
+ coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (n_coeffs == 16) return; + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
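+ // cmp_mask0/1 are -1 only where |coeff| > zbin, so when ptest finds both + // masks all-zero the whole group of 16 quantizes to zero; it is stored + // as zeros and skipped without running the multiply chain.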
+ for (index = 16; index < n_coeffs; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_avx( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m256i big_zero = _mm256_setzero_si256(); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)n_coeffs; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. + zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + shift = _mm_slli_epi16(shift, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. 
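+ // zbin now holds the AC value in every lane (the unpack duplicated its + // upper half), so coefficients 8..15 and all later groups compare + // against the AC threshold only.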
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Un-sign to bias rounding like C. + // dequant is almost always negative, so this is probably the backwards way + // to handle the sign. However, it matches the previous assembly. + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + // "Divide" by 2. + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop. 
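+ // n_coeffs is not consulted here: a 32x32 block always carries + // 32 * 32 = 1024 coefficients, so the loop bound is a constant.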
+ for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_test_all_zeros(all_zero, all_zero)) { + _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero); + _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm deleted file mode 100644 index 01c41291b..000000000 --- a/libvpx/vpx_dsp/x86/quantize_avx_x86_64.asm +++ /dev/null @@ -1,544 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - - vzeroupper - - ; If we can skip this block, then just zero the output - cmp skipmp, 0 - jne .blank - -%ifnidn %1, b_32x32 - - ; Special case for ncoeff == 16, as it is frequent and we can save on - ; not setting up a loop. - cmp ncoeffmp, 16 - jne .generic - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Special case of ncoeff == 16 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.single: - - movifnidn coeffq, coeffmp - movifnidn zbinq, zbinmp - mova m0, [zbinq] ; m0 = zbin - - ; Get DC and first 15 AC coeffs - in this special case, that is all. 
-%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers but we process them as 16 bit numbers - mova m9, [coeffq] - packssdw m9, [coeffq+16] ; m9 = c[i] - mova m10, [coeffq+32] - packssdw m10, [coeffq+48] ; m10 = c[i] -%else - mova m9, [coeffq] ; m9 = c[i] - mova m10, [coeffq+16] ; m10 = c[i] -%endif - - mov r0, eobmp ; Output pointer - mov r1, qcoeffmp ; Output pointer - mov r2, dqcoeffmp ; Output pointer - - pxor m5, m5 ; m5 = dedicated zero - - pcmpeqw m4, m4 ; All word lanes -1 - paddw m0, m4 ; m0 = zbin - 1 - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, we just write zeros - ; to the outputs and we are done. - por m14, m7, m12 - ptest m14, m14 - jnz .single_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [r1 ], ymm5 - mova [r1+32], ymm5 - mova [r2 ], ymm5 - mova [r2+32], ymm5 -%else - mova [r1], ymm5 - mova [r2], ymm5 -%endif - mov [r0], word 0 - - vzeroupper - RET - -.single_nonzero: - - ; Actual quantization of size 16 block - setup pointers, rounders, etc. - movifnidn r4, roundmp - movifnidn r5, quantmp - mov r3, dequantmp - mov r6, shiftmp - mova m1, [r4] ; m1 = round - mova m2, [r5] ; m2 = quant - mova m3, [r3] ; m3 = dequant - mova m4, [r6] ; m4 = shift - - mov r3, iscanmp - - DEFINE_ARGS eob, qcoeff, dqcoeff, iscan - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq ], m11 - mova [qcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+32], m11 - mova [qcoeffq+48], m6 -%else - mova [qcoeffq ], m8 - mova [qcoeffq+16], m13 -%endif - - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q - -%if CONFIG_VP9_HIGHBITDEPTH - ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq ], m11 - mova [dqcoeffq+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+32], m11 - mova [dqcoeffq+48], m6 -%else - mova [dqcoeffq ], m8 - mova [dqcoeffq+16], m13 -%endif - - mova m6, [iscanq] ; m6 = scan[i] - mova m11, [iscanq+16] ; m11 = scan[i] - - pcmpeqw m8, m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m13, m5 ; m13 = c[i] == 0 - psubw m6, m6, m7 ; m6 = scan[i] + 1 - psubw m11, m11, m12 ; m11 = scan[i] + 1 - pandn m8, m8, m6 ; m8 = max(eob) - pandn m13, m13, m11 ; m13 = max(eob) - pmaxsw m8, m8, m13 - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [eobq], ax - - vzeroupper - RET - - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - ;; Generic case of ncoeff != 16 - 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -.generic: - -%endif ; %ifnidn %1, b_32x32 - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - ; Actual quantization loop - setup pointers, rounders, etc. - movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant - mova m3, [r2] ; m3 = dequant - pcmpeqw m4, m4 ; All lanes -1 -%ifidn %1, b_32x32 - psubw m0, m4 - psubw m1, m4 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - paddw m0, m4 ; m0 = m0 + 1 - - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip forward quickly. 
- por m14, m7, m12 - ptest m14, m14 - jnz .first_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4 ], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4 ], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2], ymm5 - mova [dqcoeffq+ncoeffq*2], ymm5 -%endif - - add ncoeffq, mmsize - - punpckhqdq m1, m1 - punpckhqdq m2, m2 - punpckhqdq m3, m3 - punpckhqdq m4, m4 - pxor m8, m8 - - jmp .ac_only_loop - -.first_nonzero: - - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m8 - punpckhwd m6, m8, m6 - pmovsxwd m11, m8 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - -.ac_only_loop: - -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [coeffq+ncoeffq*4+ 0] - packssdw m9, [coeffq+ncoeffq*4+16] - mova m10, [coeffq+ncoeffq*4+32] - packssdw m10, [coeffq+ncoeffq*4+48] -%else - mova m9, [coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - - ; Check if all coeffs are less than zbin. If yes, skip this itertion. - ; And just write zeros as the result would be. 
- por m14, m7, m12 - ptest m14, m14 - jnz .rest_nonzero - -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], ymm5 - mova [qcoeffq+ncoeffq*4+32], ymm5 - mova [dqcoeffq+ncoeffq*4+ 0], ymm5 - mova [dqcoeffq+ncoeffq*4+32], ymm5 -%else - mova [qcoeffq+ncoeffq*2+ 0], ymm5 - mova [dqcoeffq+ncoeffq*2+ 0], ymm5 -%endif - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - -.rest_nonzero: - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif - -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif - -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - pcmpgtw m6, m5, m14 - punpckhwd m6, m14, m6 - pmovsxwd m11, m14 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pcmpgtw m6, m5, m13 - punpckhwd m6, m13, m6 - pmovsxwd m11, m13 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jnz .ac_only_loop - - ; Horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - movq rax, m8 - mov [r2], ax - vzeroupper - RET - - ; Skip-block, i.e. 
just write all zeroes -.blank: - -DEFINE_ARGS coeff, ncoeff, skip, zbin, round, quant, shift, \ - qcoeff, dqcoeff, dequant, eob, scan, iscan - - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - -DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob - -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - - neg ncoeffq - pxor m7, m7 - -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], ymm7 - mova [dqcoeffq+ncoeffq*4+32], ymm7 - mova [qcoeffq+ncoeffq*4+ 0], ymm7 - mova [qcoeffq+ncoeffq*4+32], ymm7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], ymm7 - mova [qcoeffq+ncoeffq*2+ 0], ymm7 -%endif - add ncoeffq, mmsize - jl .blank_loop - - mov [eobq], word 0 - - vzeroupper - RET -%endmacro - -INIT_XMM avx -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 - -END diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c index 32721beb3..c020b398c 100644 --- a/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -8,12 +8,14 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include <emmintrin.h> #include <xmmintrin.h> #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -22,202 +24,103 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { - __m128i zero; + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + // Poor man's abs(). 
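+ // SSE2 lacks pabsw, so invert_sign_sse2() builds abs() from the sign + // mask: psraw by 15 gives 0 or -1 per lane, and xor followed by + // subtracting the mask negates exactly the negative lanes.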
+ coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); - coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; - qcoeff_ptr += n_coeffs; - dqcoeff_ptr += n_coeffs; - n_coeffs = -n_coeffs; - zero = _mm_setzero_si128(); - if (!skip_block) { - __m128i eob; - __m128i zbin; - __m128i round, quant, dequant, shift; - { - __m128i coeff0, coeff1; - - // Setup global values - { - __m128i pw_1; - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - round = _mm_load_si128((const __m128i *)round_ptr); - quant = _mm_load_si128((const __m128i *)quant_ptr); - pw_1 = _mm_set1_epi16(1); - zbin = _mm_sub_epi16(zbin, pw_1); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - } - - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - // Do DC and first 15 AC - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - round = _mm_unpackhi_epi64(round, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - quant = _mm_unpackhi_epi64(quant, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - shift = _mm_unpackhi_epi64(shift, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - dequant = _mm_unpackhi_epi64(dequant, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - 
store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob = _mm_max_epi16(eob, eob1); - } - n_coeffs += 8 * 2; - } - - // AC only loop - while (n_coeffs < 0) { - __m128i coeff0, coeff1; - { - __m128i coeff0_sign, coeff1_sign; - __m128i qcoeff0, qcoeff1; - __m128i qtmp0, qtmp1; - __m128i cmp_mask0, cmp_mask1; - - coeff0 = load_tran_low(coeff_ptr + n_coeffs); - coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); - - // Poor man's sign extract - coeff0_sign = _mm_srai_epi16(coeff0, 15); - coeff1_sign = _mm_srai_epi16(coeff1, 15); - qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); - cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); - qcoeff0 = _mm_adds_epi16(qcoeff0, round); - qcoeff1 = _mm_adds_epi16(qcoeff1, round); - qtmp0 = _mm_mulhi_epi16(qcoeff0, quant); - qtmp1 = _mm_mulhi_epi16(qcoeff1, quant); - qtmp0 = _mm_add_epi16(qtmp0, qcoeff0); - qtmp1 = _mm_add_epi16(qtmp1, qcoeff1); - qcoeff0 = _mm_mulhi_epi16(qtmp0, shift); - qcoeff1 = _mm_mulhi_epi16(qtmp1, shift); - - // Reinsert signs - qcoeff0 = _mm_xor_si128(qcoeff0, coeff0_sign); - qcoeff1 = _mm_xor_si128(qcoeff1, coeff1_sign); - qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); - qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - - // Mask out zbin threshold coeffs - qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); - qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - - store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); - store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); - - coeff0 = _mm_mullo_epi16(qcoeff0, dequant); - coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - - store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); - store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); - } - - { - // Scan for eob - __m128i zero_coeff0, zero_coeff1; - __m128i nzero_coeff0, nzero_coeff1; - __m128i iscan0, iscan1; - __m128i eob0, eob1; - zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero); - zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); - nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); - nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); - // Add one to convert from indices to counts - iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); - iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); - eob0 = _mm_and_si128(iscan0, nzero_coeff0); - eob1 = _mm_and_si128(iscan1, nzero_coeff1); - eob0 = _mm_max_epi16(eob0, eob1); - eob = _mm_max_epi16(eob, eob0); - } - n_coeffs += 8 * 2; - } - - // Accumulate EOB - { - __m128i eob_shuffled; - eob_shuffled = _mm_shuffle_epi32(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - 
eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); - eob = _mm_max_epi16(eob, eob_shuffled); - eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); - eob = _mm_max_epi16(eob, eob_shuffled); - *eob_ptr = _mm_extract_epi16(eob, 1); - } - } else { - do { - store_tran_low(zero, dqcoeff_ptr + n_coeffs); - store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); - store_tran_low(zero, qcoeff_ptr + n_coeffs); - store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); - n_coeffs += 8 * 2; - } while (n_coeffs < 0); - *eob_ptr = 0; + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + coeff0_sign = _mm_srai_epi16(coeff0, 15); + coeff1_sign = _mm_srai_epi16(coeff1, 15); + qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign); + qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; } + + *eob_ptr = accumulate_eob(eob); } diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c new file mode 100644 index 000000000..3f528e1a9 --- /dev/null +++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" +#include "vpx_dsp/x86/quantize_x86.h" + +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + int index = 16; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i eob, eob0; + + (void)scan_ptr; + (void)skip_block; + assert(!skip_block); + + load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, + dequant_ptr, &dequant, quant_shift_ptr, &shift); + + // Do DC and first 15 AC. 
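+ // calculate_qcoeff() is the vector form of the scalar reference: with + // t = saturate16(|coeff| + round), the quantized magnitude is + // (((t * quant) >> 16) + t) * shift >> 16.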
+ coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = + scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero); + + // AC only loop. + while (index < n_coeffs) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = calculate_dqcoeff(qcoeff0, dequant); + coeff1 = calculate_dqcoeff(qcoeff1, dequant); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + + index += 16; + } + + *eob_ptr = accumulate_eob(eob); +} + +void vpx_quantize_b_32x32_ssse3( + const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan_ptr, const int16_t *iscan_ptr) { + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + int index; + + __m128i zbin, round, quant, dequant, shift; + __m128i coeff0, coeff1; + __m128i qcoeff0, qcoeff1; + __m128i cmp_mask0, cmp_mask1; + __m128i all_zero; + __m128i eob = zero, eob0; + + (void)scan_ptr; + (void)n_coeffs; + (void)skip_block; + assert(!skip_block); + + // Setup global values. + // The 32x32 halves zbin and round. + zbin = _mm_load_si128((const __m128i *)zbin_ptr); + // Shift with rounding. + zbin = _mm_add_epi16(zbin, one); + zbin = _mm_srli_epi16(zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. 
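+ // Net effect per lane: zbin becomes ROUND_POWER_OF_TWO(zbin, 1) - 1, so + // the pcmpgtw below implements |coeff| >= ROUND_POWER_OF_TWO(zbin, 1).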
+ zbin = _mm_sub_epi16(zbin, one); + + round = _mm_load_si128((const __m128i *)round_ptr); + round = _mm_add_epi16(round, one); + round = _mm_srli_epi16(round, 1); + + quant = _mm_load_si128((const __m128i *)quant_ptr); + dequant = _mm_load_si128((const __m128i *)dequant_ptr); + shift = _mm_load_si128((const __m128i *)quant_shift_ptr); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + shift = _mm_slli_epi16(shift, 1); + + // Do DC and first 15 AC. + coeff0 = load_tran_low(coeff_ptr); + coeff1 = load_tran_low(coeff_ptr + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC. + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + dequant = _mm_unpackhi_epi64(dequant, dequant); + } else { + calculate_qcoeff(&qcoeff0, round, quant, shift); + round = _mm_unpackhi_epi64(round, round); + quant = _mm_unpackhi_epi64(quant, quant); + shift = _mm_unpackhi_epi64(shift, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + // Reinsert signs. + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + // Mask out zbin threshold coeffs. + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr); + store_tran_low(qcoeff1, qcoeff_ptr + 8); + + // Un-sign to bias rounding like C. + // dequant is almost always negative, so this is probably the backwards way + // to handle the sign. However, it matches the previous assembly. + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + dequant = _mm_unpackhi_epi64(dequant, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + // "Divide" by 2. + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr); + store_tran_low(coeff1, dqcoeff_ptr + 8); + + eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, + zero); + } + + // AC only loop.
+ for (index = 16; index < 32 * 32; index += 16) { + coeff0 = load_tran_low(coeff_ptr + index); + coeff1 = load_tran_low(coeff_ptr + index + 8); + + qcoeff0 = _mm_abs_epi16(coeff0); + qcoeff1 = _mm_abs_epi16(coeff1); + + cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin); + cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin); + + all_zero = _mm_or_si128(cmp_mask0, cmp_mask1); + if (_mm_movemask_epi8(all_zero) == 0) { + _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero); + _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero); +#endif // CONFIG_VP9_HIGHBITDEPTH + continue; + } + + calculate_qcoeff(&qcoeff0, round, quant, shift); + calculate_qcoeff(&qcoeff1, round, quant, shift); + + qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0); + qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1); + + qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); + qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); + + store_tran_low(qcoeff0, qcoeff_ptr + index); + store_tran_low(qcoeff1, qcoeff_ptr + index + 8); + + coeff0 = _mm_abs_epi16(qcoeff0); + coeff1 = _mm_abs_epi16(qcoeff1); + + coeff0 = calculate_dqcoeff(coeff0, dequant); + coeff1 = calculate_dqcoeff(coeff1, dequant); + + coeff0 = _mm_srli_epi16(coeff0, 1); + coeff1 = _mm_srli_epi16(coeff1, 1); + + coeff0 = _mm_sign_epi16(coeff0, qcoeff0); + coeff1 = _mm_sign_epi16(coeff1, qcoeff1); + + store_tran_low(coeff0, dqcoeff_ptr + index); + store_tran_low(coeff1, dqcoeff_ptr + index + 8); + + eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, + index, zero); + eob = _mm_max_epi16(eob, eob0); + } + + *eob_ptr = accumulate_eob(eob); +} diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm deleted file mode 100644 index ec2cafb94..000000000 --- a/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm +++ /dev/null @@ -1,345 +0,0 @@ -; -; Copyright (c) 2015 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_1: times 8 dw 1 - -SECTION .text - -; TODO(yunqingwang)fix quantize_b code for skip=1 case. -%macro QUANTIZE_FN 2 -cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \ - shift, qcoeff, dqcoeff, dequant, \ - eob, scan, iscan - cmp dword skipm, 0 - jne .blank - - ; actual quantize loop - setup pointers, rounders, etc. 
- movifnidn coeffq, coeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, dequantmp - movifnidn zbinq, zbinmp - movifnidn roundq, roundmp - movifnidn quantq, quantmp - mova m0, [zbinq] ; m0 = zbin - mova m1, [roundq] ; m1 = round - mova m2, [quantq] ; m2 = quant -%ifidn %1, b_32x32 - pcmpeqw m5, m5 - psrlw m5, 15 - paddw m0, m5 - paddw m1, m5 - psrlw m0, 1 ; m0 = (m0 + 1) / 2 - psrlw m1, 1 ; m1 = (m1 + 1) / 2 -%endif - mova m3, [r2q] ; m3 = dequant - psubw m0, [pw_1] - mov r2, shiftmp - mov r3, qcoeffmp - mova m4, [r2] ; m4 = shift - mov r4, dqcoeffmp - mov r5, iscanmp -%ifidn %1, b_32x32 - psllw m4, 1 -%endif - pxor m5, m5 ; m5 = dedicated zero - DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea coeffq, [ coeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] - lea dqcoeffq, [dqcoeffq+ncoeffq*4] -%else - lea coeffq, [ coeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] - lea dqcoeffq, [dqcoeffq+ncoeffq*2] -%endif - lea iscanq, [ iscanq+ncoeffq*2] - neg ncoeffq - - ; get DC and first 15 AC coeffs -%if CONFIG_VP9_HIGHBITDEPTH - ; coeff stored as 32bit numbers & require 16bit numbers - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - punpckhqdq m0, m0 - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin - paddsw m6, m1 ; m6 += round - punpckhqdq m1, m1 - paddsw m11, m1 ; m11 += round - pmulhw m8, m6, m2 ; m8 = m6*q>>16 - punpckhqdq m2, m2 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m8, m6 ; m8 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m8, m4 ; m8 = m8*qsh>>16 - punpckhqdq m4, m4 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m8, m9 ; m8 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m8, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m8 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m8, m8 - pabsw m13, m13 -%endif - pmullw m8, m3 ; dqc[i] = qc[i] * q - punpckhqdq m3, m3 - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m8, 1 - psrlw m13, 1 - psignw m8, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m8 - mova m6, m8 - pcmpgtw m5, m8 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [dqcoeffq+ncoeffq*2+ 0], m8 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m8, m5 ; m8 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 
= scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m8, m6 ; m8 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m13 - add ncoeffq, mmsize - jz .accumulate_eob - -.ac_only_loop: -%if CONFIG_VP9_HIGHBITDEPTH - ; pack coeff from 32bit to 16bit array - mova m9, [ coeffq+ncoeffq*4+ 0] - packssdw m9, [ coeffq+ncoeffq*4+16] - mova m10, [ coeffq+ncoeffq*4+32] - packssdw m10, [ coeffq+ncoeffq*4+48] -%else - mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i] - mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i] -%endif - pabsw m6, m9 ; m6 = abs(m9) - pabsw m11, m10 ; m11 = abs(m10) - pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin - pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin -%ifidn %1, b_32x32 - pmovmskb r6d, m7 - pmovmskb r2d, m12 - or r6, r2 - jz .skip_iter -%endif - paddsw m6, m1 ; m6 += round - paddsw m11, m1 ; m11 += round - pmulhw m14, m6, m2 ; m14 = m6*q>>16 - pmulhw m13, m11, m2 ; m13 = m11*q>>16 - paddw m14, m6 ; m14 += m6 - paddw m13, m11 ; m13 += m11 - pmulhw m14, m4 ; m14 = m14*qsh>>16 - pmulhw m13, m4 ; m13 = m13*qsh>>16 - psignw m14, m9 ; m14 = reinsert sign - psignw m13, m10 ; m13 = reinsert sign - pand m14, m7 - pand m13, m12 -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+ 0], m11 - mova [qcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [qcoeffq+ncoeffq*4+32], m11 - mova [qcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 ; reset m5 to zero register -%else - mova [qcoeffq+ncoeffq*2+ 0], m14 - mova [qcoeffq+ncoeffq*2+16], m13 -%endif -%ifidn %1, b_32x32 - pabsw m14, m14 - pabsw m13, m13 -%endif - pmullw m14, m3 ; dqc[i] = qc[i] * q - pmullw m13, m3 ; dqc[i] = qc[i] * q -%ifidn %1, b_32x32 - psrlw m14, 1 - psrlw m13, 1 - psignw m14, m9 - psignw m13, m10 -%endif -%if CONFIG_VP9_HIGHBITDEPTH - ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff - mova m11, m14 - mova m6, m14 - pcmpgtw m5, m14 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+ 0], m11 - mova [dqcoeffq+ncoeffq*4+16], m6 - pxor m5, m5 - mova m11, m13 - mova m6, m13 - pcmpgtw m5, m13 - punpcklwd m11, m5 - punpckhwd m6, m5 - mova [dqcoeffq+ncoeffq*4+32], m11 - mova [dqcoeffq+ncoeffq*4+48], m6 - pxor m5, m5 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m14 - mova [dqcoeffq+ncoeffq*2+16], m13 -%endif - pcmpeqw m14, m5 ; m14 = c[i] == 0 - pcmpeqw m13, m5 ; m13 = c[i] == 0 - mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i] - mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i] - psubw m6, m7 ; m6 = scan[i] + 1 - psubw m11, m12 ; m11 = scan[i] + 1 - pandn m14, m6 ; m14 = max(eob) - pandn m13, m11 ; m13 = max(eob) - pmaxsw m8, m14 - pmaxsw m8, m13 - add ncoeffq, mmsize - jl .ac_only_loop - -%ifidn %1, b_32x32 - jmp .accumulate_eob -.skip_iter: -%if CONFIG_VP9_HIGHBITDEPTH - mova [qcoeffq+ncoeffq*4+ 0], m5 - mova [qcoeffq+ncoeffq*4+16], m5 - mova [qcoeffq+ncoeffq*4+32], m5 - mova [qcoeffq+ncoeffq*4+48], m5 - mova [dqcoeffq+ncoeffq*4+ 0], m5 - mova [dqcoeffq+ncoeffq*4+16], m5 - mova [dqcoeffq+ncoeffq*4+32], m5 - mova [dqcoeffq+ncoeffq*4+48], m5 -%else - mova [qcoeffq+ncoeffq*2+ 0], m5 - mova [qcoeffq+ncoeffq*2+16], m5 - mova [dqcoeffq+ncoeffq*2+ 0], m5 - mova [dqcoeffq+ncoeffq*2+16], m5 -%endif - add ncoeffq, mmsize - jl .ac_only_loop -%endif - -.accumulate_eob: - ; horizontally accumulate/max eobs and write into [eob] memory pointer - mov r2, eobmp - pshufd m7, m8, 0xe - pmaxsw 
m8, m7 - pshuflw m7, m8, 0xe - pmaxsw m8, m7 - pshuflw m7, m8, 0x1 - pmaxsw m8, m7 - pextrw r6, m8, 0 - mov [r2], r6 - RET - - ; skip-block, i.e. just write all zeroes -.blank: - mov r0, dqcoeffmp - movifnidn ncoeffq, ncoeffmp - mov r2, qcoeffmp - mov r3, eobmp - DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob -%if CONFIG_VP9_HIGHBITDEPTH - lea dqcoeffq, [dqcoeffq+ncoeffq*4] - lea qcoeffq, [ qcoeffq+ncoeffq*4] -%else - lea dqcoeffq, [dqcoeffq+ncoeffq*2] - lea qcoeffq, [ qcoeffq+ncoeffq*2] -%endif - neg ncoeffq - pxor m7, m7 -.blank_loop: -%if CONFIG_VP9_HIGHBITDEPTH - mova [dqcoeffq+ncoeffq*4+ 0], m7 - mova [dqcoeffq+ncoeffq*4+16], m7 - mova [dqcoeffq+ncoeffq*4+32], m7 - mova [dqcoeffq+ncoeffq*4+48], m7 - mova [qcoeffq+ncoeffq*4+ 0], m7 - mova [qcoeffq+ncoeffq*4+16], m7 - mova [qcoeffq+ncoeffq*4+32], m7 - mova [qcoeffq+ncoeffq*4+48], m7 -%else - mova [dqcoeffq+ncoeffq*2+ 0], m7 - mova [dqcoeffq+ncoeffq*2+16], m7 - mova [qcoeffq+ncoeffq*2+ 0], m7 - mova [qcoeffq+ncoeffq*2+16], m7 -%endif - add ncoeffq, mmsize - jl .blank_loop - mov word [eobq], 0 - RET -%endmacro - -INIT_XMM ssse3 -QUANTIZE_FN b, 7 -QUANTIZE_FN b_32x32, 7 diff --git a/libvpx/vpx_dsp/x86/quantize_x86.h b/libvpx/vpx_dsp/x86/quantize_x86.h new file mode 100644 index 000000000..34928fbb5 --- /dev/null +++ b/libvpx/vpx_dsp/x86/quantize_x86.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <emmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/bitdepth_conversion_sse2.h" + +static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, + const int16_t *round_ptr, __m128i *round, + const int16_t *quant_ptr, __m128i *quant, + const int16_t *dequant_ptr, __m128i *dequant, + const int16_t *shift_ptr, __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)zbin_ptr); + *round = _mm_load_si128((const __m128i *)round_ptr); + *quant = _mm_load_si128((const __m128i *)quant_ptr); + *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)shift_ptr); +} + +// With ssse3 and later abs() and sign() are preferred. +static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) { + a = _mm_xor_si128(a, sign); + return _mm_sub_epi16(a, sign); +} + +static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round, + const __m128i quant, const __m128i shift) { + __m128i tmp, qcoeff; + qcoeff = _mm_adds_epi16(*coeff, round); + tmp = _mm_mulhi_epi16(qcoeff, quant); + qcoeff = _mm_add_epi16(tmp, qcoeff); + *coeff = _mm_mulhi_epi16(qcoeff, shift); +} + +static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) { + return _mm_mullo_epi16(qcoeff, dequant); +} + +// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing +// to zbin to add 1 to the index in 'scan'. 
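Editor's note, not part of the patch: the mask arithmetic described above, and implemented by scan_for_eob() just below, is easiest to sanity-check against a scalar model. Subtracting a comparison mask of -1 is the same as adding 1, so each lane holds scan[i] + 1 for coefficients that passed the zbin test, the andnot clears lanes whose quantized coefficient is 0, and a running max yields the eob as a count. A minimal sketch under those assumptions (the helper name is hypothetical):

#include <stdint.h>

/* Hypothetical scalar model of the eob bookkeeping; illustration only. */
static int16_t scan_for_eob_scalar(const int16_t *qcoeff,
                                   const int16_t *zbin_mask, /* 0 or -1 */
                                   const int16_t *scan, int n) {
  int16_t eob = 0;
  int i;
  for (i = 0; i < n; i++) {
    /* scan[i] - (-1) == scan[i] + 1: turns a scan index into a count. */
    const int16_t candidate = scan[i] - zbin_mask[i];
    if (qcoeff[i] != 0 && candidate > eob) eob = candidate;
  }
  return eob; /* accumulate_eob() below does the same max across SIMD lanes */
}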
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1, + const __m128i zbin_mask0, + const __m128i zbin_mask1, + const int16_t *scan_ptr, const int index, + const __m128i zero) { + const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero); + const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero); + __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index)); + __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8)); + __m128i eob0, eob1; + // Add one to convert from indices to counts + scan0 = _mm_sub_epi16(scan0, zbin_mask0); + scan1 = _mm_sub_epi16(scan1, zbin_mask1); + eob0 = _mm_andnot_si128(zero_coeff0, scan0); + eob1 = _mm_andnot_si128(zero_coeff1, scan1); + return _mm_max_epi16(eob0, eob1); +} + +static INLINE int16_t accumulate_eob(__m128i eob) { + __m128i eob_shuffled; + eob_shuffled = _mm_shuffle_epi32(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0xe); + eob = _mm_max_epi16(eob, eob_shuffled); + eob_shuffled = _mm_shufflelo_epi16(eob, 0x1); + eob = _mm_max_epi16(eob, eob_shuffled); + return _mm_extract_epi16(eob, 1); +} diff --git a/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libvpx/vpx_dsp/x86/sad4d_avx512.c new file mode 100644 index 000000000..5f2ab6ea7 --- /dev/null +++ b/libvpx/vpx_dsp/x86/sad4d_avx512.c @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <immintrin.h> // AVX512 +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_sad64x64x4d_avx512(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4]) { + __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; + __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; + __m512i sum_mlow, sum_mhigh; + int i; + const uint8_t *ref0, *ref1, *ref2, *ref3; + + ref0 = ref[0]; + ref1 = ref[1]; + ref2 = ref[2]; + ref3 = ref[3]; + sum_ref0 = _mm512_set1_epi16(0); + sum_ref1 = _mm512_set1_epi16(0); + sum_ref2 = _mm512_set1_epi16(0); + sum_ref3 = _mm512_set1_epi16(0); + for (i = 0; i < 64; i++) { + // load src and all refs + src_reg = _mm512_loadu_si512((const __m512i *)src); + ref0_reg = _mm512_loadu_si512((const __m512i *)ref0); + ref1_reg = _mm512_loadu_si512((const __m512i *)ref1); + ref2_reg = _mm512_loadu_si512((const __m512i *)ref2); + ref3_reg = _mm512_loadu_si512((const __m512i *)ref3); + // sum of the absolute differences between every ref-i to src + ref0_reg = _mm512_sad_epu8(ref0_reg, src_reg); + ref1_reg = _mm512_sad_epu8(ref1_reg, src_reg); + ref2_reg = _mm512_sad_epu8(ref2_reg, src_reg); + ref3_reg = _mm512_sad_epu8(ref3_reg, src_reg); + // sum every ref-i + sum_ref0 = _mm512_add_epi32(sum_ref0, ref0_reg); + sum_ref1 = _mm512_add_epi32(sum_ref1, ref1_reg); + sum_ref2 = _mm512_add_epi32(sum_ref2, ref2_reg); + sum_ref3 = _mm512_add_epi32(sum_ref3, ref3_reg); + + src += src_stride; + ref0 += ref_stride; + ref1 += ref_stride; + ref2 += ref_stride; + ref3 += ref_stride; + } + { + __m256i sum256; + __m128i sum128; + // in sum_ref-i the result is saved in the first 4 bytes + // the other 4 bytes are zeroed. 
+ // sum_ref1 and sum_ref3 are shifted left by 4 bytes + sum_ref1 = _mm512_bslli_epi128(sum_ref1, 4); + sum_ref3 = _mm512_bslli_epi128(sum_ref3, 4); + + // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 + sum_ref0 = _mm512_or_si512(sum_ref0, sum_ref1); + sum_ref2 = _mm512_or_si512(sum_ref2, sum_ref3); + + // merge every 64 bit from each sum_ref-i + sum_mlow = _mm512_unpacklo_epi64(sum_ref0, sum_ref2); + sum_mhigh = _mm512_unpackhi_epi64(sum_ref0, sum_ref2); + + // add the low 64 bit to the high 64 bit + sum_mlow = _mm512_add_epi32(sum_mlow, sum_mhigh); + + // add the low 128 bit to the high 128 bit + sum256 = _mm256_add_epi32(_mm512_castsi512_si256(sum_mlow), + _mm512_extracti32x8_epi32(sum_mlow, 1)); + sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), + _mm256_extractf128_si256(sum256, 1)); + + _mm_storeu_si128((__m128i *)(res), sum128); + } +} diff --git a/libvpx/vpx_dsp/x86/sad_sse3.asm b/libvpx/vpx_dsp/x86/sad_sse3.asm index 18279bdb9..175dcc089 100644 --- a/libvpx/vpx_dsp/x86/sad_sse3.asm +++ b/libvpx/vpx_dsp/x86/sad_sse3.asm @@ -165,6 +165,8 @@ paddw mm7, mm3 %endmacro +SECTION .text + ;void int vpx_sad16x16x3_sse3( ; unsigned char *src_ptr, ; int src_stride, diff --git a/libvpx/vpx_dsp/x86/sad_sse4.asm b/libvpx/vpx_dsp/x86/sad_sse4.asm index bc6744797..03999dfca 100644 --- a/libvpx/vpx_dsp/x86/sad_sse4.asm +++ b/libvpx/vpx_dsp/x86/sad_sse4.asm @@ -165,6 +165,8 @@ movdqa [rdi + 16], xmm2 %endmacro +SECTION .text + ;void vpx_sad16x16x8_sse4_1( ; const unsigned char *src_ptr, ; int src_stride, diff --git a/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libvpx/vpx_dsp/x86/sad_ssse3.asm index 49f204fa0..7cf93cf51 100644 --- a/libvpx/vpx_dsp/x86/sad_ssse3.asm +++ b/libvpx/vpx_dsp/x86/sad_ssse3.asm @@ -146,6 +146,8 @@ %endmacro +SECTION .text + ;void int vpx_sad16x16x3_ssse3( ; unsigned char *src_ptr, ; int src_stride, diff --git a/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm b/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm index 6d58321e0..300fa8aab 100644 --- a/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm +++ b/libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm @@ -44,6 +44,9 @@ paddd %1, xmm1 SUM_ACROSS_Q %1 %endmacro + +SECTION .text + ;void ssim_parms_sse2( ; unsigned char *s, ; int sp, diff --git a/libvpx/vpx_dsp/x86/transpose_sse2.h b/libvpx/vpx_dsp/x86/transpose_sse2.h index a5e40245a..8a0119ca7 100644 --- a/libvpx/vpx_dsp/x86/transpose_sse2.h +++ b/libvpx/vpx_dsp/x86/transpose_sse2.h @@ -11,45 +11,357 @@ #ifndef VPX_DSP_X86_TRANSPOSE_SSE2_H_ #define VPX_DSP_X86_TRANSPOSE_SSE2_H_ -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/x86/inv_txfm_sse2.h" -#include "vpx_dsp/x86/txfm_common_sse2.h" +#include <emmintrin.h> // SSE2 -static INLINE void transpose_16bit_4x4(__m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); +#include "./vpx_config.h" - res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1); - res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1); +static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + return _mm_unpacklo_epi16(a0, a1); +} + +static INLINE void transpose_8bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 8 bit elements. Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 + // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 + // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 + // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]); + + // Unpack 16 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 + // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 + // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 + // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi16(a0, a1); + const __m128i b1 = _mm_unpackhi_epi16(a0, a1); + const __m128i b2 = _mm_unpacklo_epi16(a2, a3); + const __m128i b3 = _mm_unpackhi_epi16(a2, a3); + + // Unpack 32 bit elements resulting in: + // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 + // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 + // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 + // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 + const __m128i c0 = _mm_unpacklo_epi32(b0, b2); + const __m128i c1 = _mm_unpackhi_epi32(b0, b2); + const __m128i c2 = _mm_unpacklo_epi32(b1, b3); + const __m128i c3 = _mm_unpackhi_epi32(b1, b3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(c0, c0); + out[1] = _mm_unpackhi_epi64(c0, c0); + out[2] = _mm_unpacklo_epi64(c1, c1); + out[3] = _mm_unpackhi_epi64(c1, c1); + out[4] = _mm_unpacklo_epi64(c2, c2); + out[5] = _mm_unpackhi_epi64(c2, c2); + out[6] = _mm_unpacklo_epi64(c3, c3); + out[7] = _mm_unpackhi_epi64(c3, c3); +} + +static INLINE void transpose_16bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + + // Unpack 32 bit elements resulting in: + // out[0]: 00 10 20 30 01 11 21 31 + // out[1]: 02 12 22 32 03 13 23 33 + out[0] = _mm_unpacklo_epi32(a0, a1); + out[1] = _mm_unpackhi_epi32(a0, a1); +} + +static INLINE void transpose_16bit_4x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. Goes from: + // in[0]: 00 01 02 03 XX XX XX XX + // in[1]: 10 11 12 13 XX XX XX XX + // in[2]: 20 21 22 23 XX XX XX XX + // in[3]: 30 31 32 33 XX XX XX XX + // in[4]: 40 41 42 43 XX XX XX XX + // in[5]: 50 51 52 53 XX XX XX XX + // in[6]: 60 61 62 63 XX XX XX XX + // in[7]: 70 71 72 73 XX XX XX XX + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 02 12 22 32 03 13 23 33 + // b3: 42 52 62 72 43 53 63 73 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpackhi_epi32(a0, a1); + const __m128i b3 = _mm_unpackhi_epi32(a2, a3); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b2, b3); + out[3] = _mm_unpackhi_epi64(b2, b3); +} + +static INLINE void transpose_16bit_8x8(const __m128i *const in, + __m128i *const out) { + // Unpack 16 bit elements. 
Goes from: + // in[0]: 00 01 02 03 04 05 06 07 + // in[1]: 10 11 12 13 14 15 16 17 + // in[2]: 20 21 22 23 24 25 26 27 + // in[3]: 30 31 32 33 34 35 36 37 + // in[4]: 40 41 42 43 44 45 46 47 + // in[5]: 50 51 52 53 54 55 56 57 + // in[6]: 60 61 62 63 64 65 66 67 + // in[7]: 70 71 72 73 74 75 76 77 + // to: + // a0: 00 10 01 11 02 12 03 13 + // a1: 20 30 21 31 22 32 23 33 + // a2: 40 50 41 51 42 52 43 53 + // a3: 60 70 61 71 62 72 63 73 + // a4: 04 14 05 15 06 16 07 17 + // a5: 24 34 25 35 26 36 27 37 + // a6: 44 54 45 55 46 56 47 57 + // a7: 64 74 65 75 66 76 67 77 + const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]); + const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]); + const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]); + const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]); + const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]); + const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]); + + // Unpack 32 bit elements resulting in: + // b0: 00 10 20 30 01 11 21 31 + // b1: 40 50 60 70 41 51 61 71 + // b2: 04 14 24 34 05 15 25 35 + // b3: 44 54 64 74 45 55 65 75 + // b4: 02 12 22 32 03 13 23 33 + // b5: 42 52 62 72 43 53 63 73 + // b6: 06 16 26 36 07 17 27 37 + // b7: 46 56 66 76 47 57 67 77 + const __m128i b0 = _mm_unpacklo_epi32(a0, a1); + const __m128i b1 = _mm_unpacklo_epi32(a2, a3); + const __m128i b2 = _mm_unpacklo_epi32(a4, a5); + const __m128i b3 = _mm_unpacklo_epi32(a6, a7); + const __m128i b4 = _mm_unpackhi_epi32(a0, a1); + const __m128i b5 = _mm_unpackhi_epi32(a2, a3); + const __m128i b6 = _mm_unpackhi_epi32(a4, a5); + const __m128i b7 = _mm_unpackhi_epi32(a6, a7); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 40 50 60 70 + // out[1]: 01 11 21 31 41 51 61 71 + // out[2]: 02 12 22 32 42 52 62 72 + // out[3]: 03 13 23 33 43 53 63 73 + // out[4]: 04 14 24 34 44 54 64 74 + // out[5]: 05 15 25 35 45 55 65 75 + // out[6]: 06 16 26 36 46 56 66 76 + // out[7]: 07 17 27 37 47 57 67 77 + out[0] = _mm_unpacklo_epi64(b0, b1); + out[1] = _mm_unpackhi_epi64(b0, b1); + out[2] = _mm_unpacklo_epi64(b4, b5); + out[3] = _mm_unpackhi_epi64(b4, b5); + out[4] = _mm_unpacklo_epi64(b2, b3); + out[5] = _mm_unpackhi_epi64(b2, b3); + out[6] = _mm_unpacklo_epi64(b6, b7); + out[7] = _mm_unpackhi_epi64(b6, b7); +} + +// Transpose in-place +static INLINE void transpose_16bit_16x16(__m128i *const left, + __m128i *const right) { + __m128i tbuf[8]; + transpose_16bit_8x8(left, left); + transpose_16bit_8x8(right, tbuf); + transpose_16bit_8x8(left + 8, right); + transpose_16bit_8x8(right + 8, right + 8); + + left[8] = tbuf[0]; + left[9] = tbuf[1]; + left[10] = tbuf[2]; + left[11] = tbuf[3]; + left[12] = tbuf[4]; + left[13] = tbuf[5]; + left[14] = tbuf[6]; + left[15] = tbuf[7]; +} + +static INLINE void transpose_32bit_4x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); } -static INLINE void transpose_32bit_4x4(__m128i *const a0, __m128i *const a1, - __m128i *const a2, __m128i *const a3) { +static INLINE void transpose_32bit_4x4x2(const __m128i *const in, + __m128i *const out) { // Unpack 32 bit elements. Goes from: - // a0: 00 01 02 03 - // a1: 10 11 12 13 - // a2: 20 21 22 23 - // a3: 30 31 32 33 + // in[0]: 00 01 02 03 + // in[1]: 10 11 12 13 + // in[2]: 20 21 22 23 + // in[3]: 30 31 32 33 + // in[4]: 04 05 06 07 + // in[5]: 14 15 16 17 + // in[6]: 24 25 26 27 + // in[7]: 34 35 36 37 // to: - // b0: 00 10 01 11 - // b1: 20 30 21 31 - // b2: 02 12 03 13 - // b3: 22 32 23 33 + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]); + const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]); + const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]); + const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]); + const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]); + const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]); - const __m128i b0 = _mm_unpacklo_epi32(*a0, *a1); - const __m128i b1 = _mm_unpacklo_epi32(*a2, *a3); - const __m128i b2 = _mm_unpackhi_epi32(*a0, *a1); - const __m128i b3 = _mm_unpackhi_epi32(*a2, *a3); + // Unpack 64 bit elements resulting in: + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); +} + +static INLINE void transpose_32bit_8x4(const __m128i *const in, + __m128i *const out) { + // Unpack 32 bit elements. 
Goes from: + // in[0]: 00 01 02 03 + // in[1]: 04 05 06 07 + // in[2]: 10 11 12 13 + // in[3]: 14 15 16 17 + // in[4]: 20 21 22 23 + // in[5]: 24 25 26 27 + // in[6]: 30 31 32 33 + // in[7]: 34 35 36 37 + // to: + // a0: 00 10 01 11 + // a1: 20 30 21 31 + // a2: 02 12 03 13 + // a3: 22 32 23 33 + // a4: 04 14 05 15 + // a5: 24 34 25 35 + // a6: 06 16 07 17 + // a7: 26 36 27 37 + const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]); + const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]); + const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]); + const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]); + const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]); + const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]); + const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]); + const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]); // Unpack 64 bit elements resulting in: - // a0: 00 10 20 30 - // a1: 01 11 21 31 - // a2: 02 12 22 32 - // a3: 03 13 23 33 - *a0 = _mm_unpacklo_epi64(b0, b1); - *a1 = _mm_unpackhi_epi64(b0, b1); - *a2 = _mm_unpacklo_epi64(b2, b3); - *a3 = _mm_unpackhi_epi64(b2, b3); + // out[0]: 00 10 20 30 + // out[1]: 01 11 21 31 + // out[2]: 02 12 22 32 + // out[3]: 03 13 23 33 + // out[4]: 04 14 24 34 + // out[5]: 05 15 25 35 + // out[6]: 06 16 26 36 + // out[7]: 07 17 27 37 + out[0] = _mm_unpacklo_epi64(a0, a1); + out[1] = _mm_unpackhi_epi64(a0, a1); + out[2] = _mm_unpacklo_epi64(a2, a3); + out[3] = _mm_unpackhi_epi64(a2, a3); + out[4] = _mm_unpacklo_epi64(a4, a5); + out[5] = _mm_unpackhi_epi64(a4, a5); + out[6] = _mm_unpacklo_epi64(a6, a7); + out[7] = _mm_unpackhi_epi64(a6, a7); } #endif // VPX_DSP_X86_TRANSPOSE_SSE2_H_ diff --git a/libvpx/vpx_dsp/x86/txfm_common_sse2.h b/libvpx/vpx_dsp/x86/txfm_common_sse2.h index f8edb1b78..0a9542c85 100644 --- a/libvpx/vpx_dsp/x86/txfm_common_sse2.h +++ b/libvpx/vpx_dsp/x86/txfm_common_sse2.h @@ -18,6 +18,9 @@ _mm_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) +#define pair_set_epi32(a, b) \ + _mm_set_epi32((int)(b), (int)(a), (int)(b), (int)(a)) + #define dual_set_epi16(a, b) \ _mm_set_epi16((int16_t)(b), (int16_t)(b), (int16_t)(b), (int16_t)(b), \ (int16_t)(a), (int16_t)(a), (int16_t)(a), (int16_t)(a)) diff --git a/libvpx/vpx_dsp/x86/variance_avx2.c b/libvpx/vpx_dsp/x86/variance_avx2.c index 8428e0520..d15a89c74 100644 --- a/libvpx/vpx_dsp/x86/variance_avx2.c +++ b/libvpx/vpx_dsp/x86/variance_avx2.c @@ -7,16 +7,592 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ + +#include <immintrin.h> // AVX2 + #include "./vpx_dsp_rtcd.h" +/* clang-format off */ +DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, + 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, +}; + +DECLARE_ALIGNED(32, static const int8_t, adjacent_sub_avx2[32]) = { + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, + 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1 +}; +/* clang-format on */ + +void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse, int *sum) { + unsigned int i, src_2strides, ref_2strides; + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + // process two 16 byte locations in a 256 bit register + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; ++i) { + // convert up values in 128 bit registers across lanes + const __m256i src0 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(src_ptr))); + const __m256i src1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i const *)(src_ptr + source_stride))); + const __m256i ref0 = + _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i const *)(ref_ptr))); + const __m256i ref1 = _mm256_cvtepu8_epi16( + _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride))); + const __m256i diff0 = _mm256_sub_epi16(src0, ref0); + const __m256i diff1 = _mm256_sub_epi16(src1, ref1); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + + // add to the running totals + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16( + _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); + const __m128i sse_reg_128 = _mm_add_epi32( + _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + + // sum upper and lower 64 bits together and convert up to 32 bit values + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, 
_mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); + } +} + +static void get32x16var_avx2(const unsigned char *src_ptr, int source_stride, + const unsigned char *ref_ptr, int recon_stride, + unsigned int *sse, int *sum) { + unsigned int i, src_2strides, ref_2strides; + const __m256i adj_sub = _mm256_load_si256((__m256i const *)adjacent_sub_avx2); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + + // process 64 elements in an iteration + src_2strides = source_stride << 1; + ref_2strides = recon_stride << 1; + for (i = 0; i < 8; i++) { + const __m256i src0 = _mm256_loadu_si256((__m256i const *)(src_ptr)); + const __m256i src1 = + _mm256_loadu_si256((__m256i const *)(src_ptr + source_stride)); + const __m256i ref0 = _mm256_loadu_si256((__m256i const *)(ref_ptr)); + const __m256i ref1 = + _mm256_loadu_si256((__m256i const *)(ref_ptr + recon_stride)); + + // unpack into pairs of source and reference values + const __m256i src_ref0 = _mm256_unpacklo_epi8(src0, ref0); + const __m256i src_ref1 = _mm256_unpackhi_epi8(src0, ref0); + const __m256i src_ref2 = _mm256_unpacklo_epi8(src1, ref1); + const __m256i src_ref3 = _mm256_unpackhi_epi8(src1, ref1); + + // subtract adjacent elements using src*1 + ref*-1 + const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub); + const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub); + const __m256i diff2 = _mm256_maddubs_epi16(src_ref2, adj_sub); + const __m256i diff3 = _mm256_maddubs_epi16(src_ref3, adj_sub); + const __m256i madd0 = _mm256_madd_epi16(diff0, diff0); + const __m256i madd1 = _mm256_madd_epi16(diff1, diff1); + const __m256i madd2 = _mm256_madd_epi16(diff2, diff2); + const __m256i madd3 = _mm256_madd_epi16(diff3, diff3); + + // add to the running totals + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff0, diff1)); + sum_reg = _mm256_add_epi16(sum_reg, _mm256_add_epi16(diff2, diff3)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd0, madd1)); + sse_reg = _mm256_add_epi32(sse_reg, _mm256_add_epi32(madd2, madd3)); + + src_ptr += src_2strides; + ref_ptr += ref_2strides; + } + + { + // extract the low lane and add it to the high lane + const __m128i sum_reg_128 = _mm_add_epi16( + _mm256_castsi256_si128(sum_reg), _mm256_extractf128_si256(sum_reg, 1)); + const __m128i sse_reg_128 = _mm_add_epi32( + _mm256_castsi256_si128(sse_reg), _mm256_extractf128_si256(sse_reg, 1)); + + // sum upper and lower 64 bits together and convert up to 32 bit values + const __m128i sum_reg_64 = + _mm_add_epi16(sum_reg_128, _mm_srli_si128(sum_reg_128, 8)); + const __m128i sum_int32 = _mm_cvtepi16_epi32(sum_reg_64); + + // unpack sse and sum registers and add + const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, sum_int32); + const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi); + + // perform the final summation and extract the results + const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8)); + *((int *)sse) = _mm_cvtsi128_si32(res); + *((int *)sum) = _mm_extract_epi32(res, 1); + } +} + +#define FILTER_SRC(filter) \ + /* filter the source */ \ + exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ + exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ + \ + /* add 8 to source */ \ + exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ + exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ + \ + /* divide source by 16 */ \ + exp_src_lo = 
_mm256_srai_epi16(exp_src_lo, 4); \
+  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+  /* expand each byte to 2 bytes */ \
+  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+  /* source - dest */ \
+  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */ \
+  *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_lo); \
+  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+  *sum_reg = _mm256_add_epi16(*sum_reg, exp_src_hi); \
+  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+  /* calculate sse */ \
+  *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_lo); \
+  *sse_reg = _mm256_add_epi32(*sse_reg, exp_src_hi);
+
+// final calculation to sum and sse
+#define CALC_SUM_AND_SSE \
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+static INLINE void spv32_x0_y0(const uint8_t *src, int src_stride,
+                               const uint8_t *dst, int dst_stride,
+                               const uint8_t *sec, int sec_stride, int do_sec,
+                               int height, __m256i *sum_reg, __m256i *sse_reg) {
+  const __m256i zero_reg = _mm256_setzero_si256();
+  __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  int i;
+  for (i = 0; i < height; i++) {
+    const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst);
+    const __m256i src_reg = _mm256_loadu_si256((__m256i const *)src);
+    if (do_sec) {
+      const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec);
+      const __m256i avg_reg = _mm256_avg_epu8(src_reg, sec_reg);
+      exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg);
+      sec += sec_stride;
+    } else {
+      exp_src_lo = _mm256_unpacklo_epi8(src_reg, zero_reg);
+      exp_src_hi = _mm256_unpackhi_epi8(src_reg, zero_reg);
+    }
+    CALC_SUM_SSE_INSIDE_LOOP
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// (x == 0, y == 4) or (x == 4, y == 0). sstep determines the direction.
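Editor's aside, not part of the patch: the half-pel helpers that follow all reduce to one _mm256_avg_epu8, whose per-byte semantics are (a + b + 1) >> 1. A scalar sketch of the shared spv32_half_zero() path under that assumption, with sstep selecting the direction (src_stride for vertical, 1 for horizontal); the helper name is illustrative and the optional second-prediction average (do_sec) is omitted:

#include <stdint.h>

/* Scalar model of spv32_half_zero(); illustration only. */
static void spv32_half_zero_model(const uint8_t *src, int src_stride,
                                  const uint8_t *dst, int dst_stride,
                                  int height, int sstep, int *sum,
                                  unsigned int *sse) {
  int i, j;
  for (i = 0; i < height; i++) {
    for (j = 0; j < 32; j++) { /* 32 pixels per row, as in the AVX2 code */
      /* pavgb rounds up: (a + b + 1) >> 1 */
      const int pred = (src[j] + src[j + sstep] + 1) >> 1;
      const int diff = pred - dst[j];
      *sum += diff;
      *sse += (unsigned int)(diff * diff);
    }
    src += src_stride;
    dst += dst_stride;
  }
}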
+static INLINE void spv32_half_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_avg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, src_stride); +} + +static INLINE void spv32_x4_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + spv32_half_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, 1); +} + +static INLINE void spv32_x4_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)(src)); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + const __m256i current_avg = _mm256_avg_epu8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_reg = _mm256_avg_epu8(current_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(current_avg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(current_avg, zero_reg); + } + // save current source average + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +// (x == 0, y == bil) or (x == 4, y == bil). sstep determines the direction. 
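Editor's aside, not part of the patch: for the bilinear cases below, each 32-byte row of bilinear_filters_avx2 (selected by offset << 5) holds the interleaved two-tap pair (16 - 2*offset, 2*offset), so the maddubs in FILTER_SRC computes a*f0 + b*f1 per pixel, and the add-8/shift-4 pair is a round-to-nearest division by 16. A one-pixel scalar model of that step, with a hypothetical helper name:

/* Scalar model of one FILTER_SRC tap; offset is the subpel phase (0..7). */
static int bilinear_tap_model(int a, int b, int offset) {
  const int f0 = 16 - 2 * offset; /* first tap, e.g. 14 for offset 1 */
  const int f1 = 2 * offset;      /* second tap, e.g. 2 for offset 1 */
  return (a * f0 + b * f1 + 8) >> 4; /* add 8, then divide by 16 */
}

Note that for offset 4 this collapses to (8*a + 8*b + 8) >> 4 == (a + b + 1) >> 1, which is why the code special-cases that phase with the cheaper pavgb path above instead of a filter.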
+static INLINE void spv32_bilin_zero(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, __m256i *sum_reg, + __m256i *sse_reg, int offset, int sstep) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (offset << 5))); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + sstep)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + sec += sec_stride; + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src += src_stride; + dst += dst_stride; + } +} + +static INLINE void spv32_x0_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int y_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, y_offset, src_stride); +} + +static INLINE void spv32_xb_y0(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset) { + spv32_bilin_zero(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, sum_reg, sse_reg, x_offset, 1); +} + +static INLINE void spv32_x4_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i prev_src_avg = _mm256_avg_epu8(src_a, src_b); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + int i; + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + const __m256i src_avg = _mm256_avg_epu8(src_0, src_1); + exp_src_lo = _mm256_unpacklo_epi8(prev_src_avg, src_avg); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_avg, src_avg); + prev_src_avg = src_avg; + + FILTER_SRC(filter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src_avg, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } + 
CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_y4(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i filter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i src_reg, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(filter) + // convert each 16 bit to 8 bit to each low and high lane source + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + src += src_stride; + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(filter) + + src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + // average between previous pack to the current + src_pack = _mm256_avg_epu8(src_pack, src_reg); + + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i avg_pack = _mm256_avg_epu8(src_pack, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_pack, zero_reg); + sec += sec_stride; + } else { + exp_src_lo = _mm256_unpacklo_epi8(src_pack, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(src_pack, zero_reg); + } + CALC_SUM_SSE_INSIDE_LOOP + src_pack = src_reg; + dst += dst_stride; + src += src_stride; + } +} + +static INLINE void spv32_xb_yb(const uint8_t *src, int src_stride, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, int do_sec, + int height, __m256i *sum_reg, __m256i *sse_reg, + int x_offset, int y_offset) { + const __m256i zero_reg = _mm256_setzero_si256(); + const __m256i pw8 = _mm256_set1_epi16(8); + const __m256i xfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (x_offset << 5))); + const __m256i yfilter = _mm256_load_si256( + (__m256i const *)(bilinear_filters_avx2 + (y_offset << 5))); + const __m256i src_a = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_b = _mm256_loadu_si256((__m256i const *)(src + 1)); + __m256i exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; + __m256i prev_src_pack, src_pack; + int i; + exp_src_lo = _mm256_unpacklo_epi8(src_a, src_b); + exp_src_hi = _mm256_unpackhi_epi8(src_a, src_b); + FILTER_SRC(xfilter) + // convert each 16 bit to 8 bit to each low and high lane source + prev_src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + src += src_stride; + + for (i = 0; i < height; i++) { + const __m256i dst_reg = _mm256_loadu_si256((__m256i const *)dst); + const __m256i src_0 = _mm256_loadu_si256((__m256i const *)src); + const __m256i src_1 = _mm256_loadu_si256((__m256i const *)(src + 1)); + exp_src_lo = _mm256_unpacklo_epi8(src_0, src_1); + exp_src_hi = _mm256_unpackhi_epi8(src_0, src_1); + + FILTER_SRC(xfilter) + src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + + // merge previous 
pack to current pack source + exp_src_lo = _mm256_unpacklo_epi8(prev_src_pack, src_pack); + exp_src_hi = _mm256_unpackhi_epi8(prev_src_pack, src_pack); + + FILTER_SRC(yfilter) + if (do_sec) { + const __m256i sec_reg = _mm256_loadu_si256((__m256i const *)sec); + const __m256i exp_src = _mm256_packus_epi16(exp_src_lo, exp_src_hi); + const __m256i avg_reg = _mm256_avg_epu8(exp_src, sec_reg); + exp_src_lo = _mm256_unpacklo_epi8(avg_reg, zero_reg); + exp_src_hi = _mm256_unpackhi_epi8(avg_reg, zero_reg); + sec += sec_stride; + } + + prev_src_pack = src_pack; + + CALC_SUM_SSE_INSIDE_LOOP + dst += dst_stride; + src += src_stride; + } +} + +static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride, + int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, + const uint8_t *sec, int sec_stride, + int do_sec, int height, unsigned int *sse) { + const __m256i zero_reg = _mm256_setzero_si256(); + __m256i sum_reg = _mm256_setzero_si256(); + __m256i sse_reg = _mm256_setzero_si256(); + __m256i sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; + int sum; + // x_offset = 0 and y_offset = 0 + if (x_offset == 0) { + if (y_offset == 0) { + spv32_x0_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x0_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 0 and y_offset = bilin interpolation + } else { + spv32_x0_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = 4 and y_offset = 0 + } else if (x_offset == 4) { + if (y_offset == 0) { + spv32_x4_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = 4 + } else if (y_offset == 4) { + spv32_x4_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg); + // x_offset = 4 and y_offset = bilin interpolation + } else { + spv32_x4_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, y_offset); + } + // x_offset = bilin interpolation and y_offset = 0 + } else { + if (y_offset == 0) { + spv32_xb_y0(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = 4 + } else if (y_offset == 4) { + spv32_xb_y4(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset); + // x_offset = bilin interpolation and y_offset = bilin interpolation + } else { + spv32_xb_yb(src, src_stride, dst, dst_stride, sec, sec_stride, do_sec, + height, &sum_reg, &sse_reg, x_offset, y_offset); + } + } + CALC_SUM_AND_SSE + return sum; +} + +static unsigned int sub_pixel_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + NULL, 0, 0, height, sse); +} + +static unsigned int sub_pixel_avg_variance32xh_avx2( + const uint8_t *src, int src_stride, int x_offset, int y_offset, + const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, + int height, unsigned int *sse) { + return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride, + sec, sec_stride, 1, height, sse); +} + typedef void (*get_var_avx2)(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, unsigned 
int *sse, int *sum); -void vpx_get32x32var_avx2(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, unsigned int *sse, - int *sum); - static void variance_avx2(const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, int w, int h, unsigned int *sse, int *sum, get_var_avx2 var_fn, @@ -44,7 +620,7 @@ unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride, int sum; variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, vpx_get16x16var_avx2, 16); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride, @@ -60,7 +636,7 @@ unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 9); } @@ -69,7 +645,7 @@ unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 32, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 10); } @@ -78,7 +654,7 @@ unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 64, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 12); } @@ -87,32 +663,22 @@ unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride, unsigned int *sse) { int sum; variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum, - vpx_get32x32var_avx2, 32); + get32x16var_avx2, 32); return *sse - (uint32_t)(((int64_t)sum * sum) >> 11); } -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse); - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sseptr); - unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { unsigned int sse1; - const int se1 = vpx_sub_pixel_variance32xh_avx2( + const int se1 = sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 64, &sse1); unsigned int sse2; const int se2 = - vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, - dst + 32, dst_stride, 64, &sse2); + sub_pixel_variance32xh_avx2(src + 32, src_stride, x_offset, y_offset, + dst + 32, dst_stride, 64, &sse2); const int se = se1 + se2; *sse = sse1 + sse2; return *sse - (uint32_t)(((int64_t)se * se) >> 12); @@ -123,7 +689,7 @@ unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse) { - const int se = vpx_sub_pixel_variance32xh_avx2( + const int se = sub_pixel_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } @@ -132,10 +698,10 @@ unsigned int vpx_sub_pixel_avg_variance64x64_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, 
unsigned int *sse, const uint8_t *sec) { unsigned int sse1; - const int se1 = vpx_sub_pixel_avg_variance32xh_avx2( + const int se1 = sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 64, 64, &sse1); unsigned int sse2; - const int se2 = vpx_sub_pixel_avg_variance32xh_avx2( + const int se2 = sub_pixel_avg_variance32xh_avx2( src + 32, src_stride, x_offset, y_offset, dst + 32, dst_stride, sec + 32, 64, 64, &sse2); const int se = se1 + se2; @@ -149,7 +715,7 @@ unsigned int vpx_sub_pixel_avg_variance32x32_avx2( const uint8_t *src, int src_stride, int x_offset, int y_offset, const uint8_t *dst, int dst_stride, unsigned int *sse, const uint8_t *sec) { // Process 32 elements in parallel. - const int se = vpx_sub_pixel_avg_variance32xh_avx2( + const int se = sub_pixel_avg_variance32xh_avx2( src, src_stride, x_offset, y_offset, dst, dst_stride, sec, 32, 32, sse); return *sse - (uint32_t)(((int64_t)se * se) >> 10); } diff --git a/libvpx/vpx_dsp/x86/variance_impl_avx2.c b/libvpx/vpx_dsp/x86/variance_impl_avx2.c deleted file mode 100644 index 51e6b19ad..000000000 --- a/libvpx/vpx_dsp/x86/variance_impl_avx2.c +++ /dev/null @@ -1,708 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <immintrin.h> // AVX2 - -#include "./vpx_dsp_rtcd.h" -#include "vpx_ports/mem.h" - -/* clang-format off */ -DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = { - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, - 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, -}; -/* clang-format on */ - -void vpx_get16x16var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i, src_2strides, ref_2strides; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing two strides in a 256 bit register reducing the number - // of loop stride by half (comparing to the sse2 code) - src_2strides = source_stride << 1; - ref_2strides = recon_stride << 1; - for (i = 0; i < 8; i++) { - src = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(src_ptr))); - src = _mm256_inserti128_si256( - src, 
_mm_loadu_si128((__m128i const *)(src_ptr + source_stride)), 1); - - ref = _mm256_castsi128_si256(_mm_loadu_si128((__m128i const *)(ref_ptr))); - ref = _mm256_inserti128_si256( - ref, _mm_loadu_si128((__m128i const *)(ref_ptr + recon_stride)), 1); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += src_2strides; - ref_ptr += ref_2strides; - } - - { - __m128i sum_res, madd_res; - __m128i expand_sum_low, expand_sum_high, expand_sum; - __m128i expand_madd_low, expand_madd_high, expand_madd; - __m128i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // extract the low lane and add it to the high lane - sum_res = _mm_add_epi16(_mm256_castsi256_si128(sum_ref_src), - _mm256_extractf128_si256(sum_ref_src, 1)); - - madd_res = _mm_add_epi32(_mm256_castsi256_si128(madd_ref_src), - _mm256_extractf128_si256(madd_ref_src, 1)); - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = - _mm_unpacklo_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - expand_sum_high = - _mm_unpackhi_epi16(_mm256_castsi256_si128(zero_reg), sum_res); - - // shifting the sign 16 bits right - expand_sum_low = _mm_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = - _mm_unpacklo_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - expand_madd_high = - _mm_unpackhi_epi32(madd_res, _mm256_castsi256_si128(zero_reg)); - - expand_madd = _mm_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = - _mm_unpacklo_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - ex_expand_sum_high = - _mm_unpackhi_epi32(expand_sum, _mm256_castsi256_si128(zero_reg)); - - ex_expand_sum = _mm_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_res = _mm_srli_si128(expand_madd, 8); - sum_res = _mm_srli_si128(ex_expand_sum, 8); - - madd_res = _mm_add_epi32(madd_res, expand_madd); - sum_res = _mm_add_epi32(sum_res, ex_expand_sum); - - *((int *)SSE) = _mm_cvtsi128_si32(madd_res); - - *((int *)Sum) = _mm_cvtsi128_si32(sum_res); - } -} - -void vpx_get32x32var_avx2(const unsigned char *src_ptr, int source_stride, - const unsigned char *ref_ptr, int recon_stride, - unsigned int *SSE, int *Sum) { - __m256i src, src_expand_low, src_expand_high, ref, ref_expand_low; - __m256i ref_expand_high, madd_low, madd_high; - unsigned int i; - __m256i zero_reg = _mm256_set1_epi16(0); - __m256i sum_ref_src = _mm256_set1_epi16(0); - __m256i madd_ref_src = _mm256_set1_epi16(0); - - // processing 32 elements in parallel - for (i = 0; i < 16; i++) { - src = _mm256_loadu_si256((__m256i const 
*)(src_ptr)); - - ref = _mm256_loadu_si256((__m256i const *)(ref_ptr)); - - // expanding to 16 bit each lane - src_expand_low = _mm256_unpacklo_epi8(src, zero_reg); - src_expand_high = _mm256_unpackhi_epi8(src, zero_reg); - - ref_expand_low = _mm256_unpacklo_epi8(ref, zero_reg); - ref_expand_high = _mm256_unpackhi_epi8(ref, zero_reg); - - // src-ref - src_expand_low = _mm256_sub_epi16(src_expand_low, ref_expand_low); - src_expand_high = _mm256_sub_epi16(src_expand_high, ref_expand_high); - - // madd low (src - ref) - madd_low = _mm256_madd_epi16(src_expand_low, src_expand_low); - - // add high to low - src_expand_low = _mm256_add_epi16(src_expand_low, src_expand_high); - - // madd high (src - ref) - madd_high = _mm256_madd_epi16(src_expand_high, src_expand_high); - - sum_ref_src = _mm256_add_epi16(sum_ref_src, src_expand_low); - - // add high to low - madd_ref_src = - _mm256_add_epi32(madd_ref_src, _mm256_add_epi32(madd_low, madd_high)); - - src_ptr += source_stride; - ref_ptr += recon_stride; - } - - { - __m256i expand_sum_low, expand_sum_high, expand_sum; - __m256i expand_madd_low, expand_madd_high, expand_madd; - __m256i ex_expand_sum_low, ex_expand_sum_high, ex_expand_sum; - - // padding each 2 bytes with another 2 zeroed bytes - expand_sum_low = _mm256_unpacklo_epi16(zero_reg, sum_ref_src); - expand_sum_high = _mm256_unpackhi_epi16(zero_reg, sum_ref_src); - - // shifting the sign 16 bits right - expand_sum_low = _mm256_srai_epi32(expand_sum_low, 16); - expand_sum_high = _mm256_srai_epi32(expand_sum_high, 16); - - expand_sum = _mm256_add_epi32(expand_sum_low, expand_sum_high); - - // expand each 32 bits of the madd result to 64 bits - expand_madd_low = _mm256_unpacklo_epi32(madd_ref_src, zero_reg); - expand_madd_high = _mm256_unpackhi_epi32(madd_ref_src, zero_reg); - - expand_madd = _mm256_add_epi32(expand_madd_low, expand_madd_high); - - ex_expand_sum_low = _mm256_unpacklo_epi32(expand_sum, zero_reg); - ex_expand_sum_high = _mm256_unpackhi_epi32(expand_sum, zero_reg); - - ex_expand_sum = _mm256_add_epi32(ex_expand_sum_low, ex_expand_sum_high); - - // shift 8 bytes eight - madd_ref_src = _mm256_srli_si256(expand_madd, 8); - sum_ref_src = _mm256_srli_si256(ex_expand_sum, 8); - - madd_ref_src = _mm256_add_epi32(madd_ref_src, expand_madd); - sum_ref_src = _mm256_add_epi32(sum_ref_src, ex_expand_sum); - - // extract the low lane and the high lane and add the results - *((int *)SSE) = - _mm_cvtsi128_si32(_mm256_castsi256_si128(madd_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(madd_ref_src, 1)); - - *((int *)Sum) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_ref_src)) + - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1)); - } -} - -#define FILTER_SRC(filter) \ - /* filter the source */ \ - exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \ - exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \ - \ - /* add 8 to source */ \ - exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \ - exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \ - \ - /* divide source by 16 */ \ - exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \ - exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4); - -#define MERGE_WITH_SRC(src_reg, reg) \ - exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \ - exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg); - -#define LOAD_SRC_DST \ - /* load source and destination */ \ - src_reg = _mm256_loadu_si256((__m256i const *)(src)); \ - dst_reg = _mm256_loadu_si256((__m256i const *)(dst)); - -#define AVG_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = 
_mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - /* average between current and next stride source */ \ - src_reg = _mm256_avg_epu8(src_reg, src_next_reg); - -#define MERGE_NEXT_SRC(src_reg, size_stride) \ - src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \ - MERGE_WITH_SRC(src_reg, src_next_reg) - -#define CALC_SUM_SSE_INSIDE_LOOP \ - /* expand each byte to 2 bytes */ \ - exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \ - exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \ - /* source - dest */ \ - exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \ - exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \ - /* caculate sum */ \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \ - exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \ - sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \ - exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \ - /* calculate sse */ \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \ - sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi); - -// final calculation to sum and sse -#define CALC_SUM_AND_SSE \ - res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \ - sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \ - sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \ - \ - sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \ - \ - sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \ - sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \ - sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \ - sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \ - _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1)); - -unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride, - int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, - int height, unsigned int *sse) { - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { 
- if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - // save current source average - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src_pack = src_reg; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = 
_mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} - -unsigned int vpx_sub_pixel_avg_variance32xh_avx2( - const uint8_t *src, int src_stride, int x_offset, int y_offset, - const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, - int height, unsigned int *sse) { - __m256i sec_reg; - __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; - __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; - __m256i zero_reg; - int i, sum; - sum_reg = _mm256_set1_epi16(0); - sse_reg = _mm256_set1_epi16(0); - zero_reg = _mm256_set1_epi16(0); - - // x_offset = 0 and y_offset = 0 - if (x_offset == 0) { - if (y_offset == 0) { - for (i = 0; i < height; i++) { - LOAD_SRC_DST - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } else if (y_offset == 8) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, src_stride) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expend each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 0 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg; - - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, src_stride) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - } - // x_offset = 8 and y_offset = 0 - } else if (x_offset == 8) { - if (y_offset == 0) { - __m256i src_next_reg; - for (i = 0; i < height; i++) { - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - sec += sec_stride; - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_reg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = 8 and y_offset = 8 - } else if (y_offset == 8) { - __m256i src_next_reg, src_avg; - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - // average between previous average to current average - src_avg = _mm256_avg_epu8(src_avg, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - sec += sec_stride; - // expand each byte 
to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = 8 and y_offset = bilin interpolation - } else { - __m256i filter, pw8, src_next_reg, src_avg; - y_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - AVG_NEXT_SRC(src_reg, 1) - for (i = 0; i < height; i++) { - // save current source average - src_avg = src_reg; - src += src_stride; - LOAD_SRC_DST - AVG_NEXT_SRC(src_reg, 1) - MERGE_WITH_SRC(src_avg, src_reg) - FILTER_SRC(filter) - src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_avg = _mm256_avg_epu8(src_avg, sec_reg); - // expand each byte to 2 bytes - MERGE_WITH_SRC(src_avg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - // x_offset = bilin interpolation and y_offset = 0 - } else { - if (y_offset == 0) { - __m256i filter, pw8, src_next_reg; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - for (i = 0; i < height; i++) { - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_reg = _mm256_avg_epu8(src_reg, sec_reg); - MERGE_WITH_SRC(src_reg, zero_reg) - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - src += src_stride; - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = 8 - } else if (y_offset == 8) { - __m256i filter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - filter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - pw8 = _mm256_set1_epi16(8); - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(filter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // average between previous pack to the current - src_pack = _mm256_avg_epu8(src_pack, src_reg); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - sec += sec_stride; - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - // x_offset = bilin interpolation and y_offset = bilin interpolation - } else { - __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; - x_offset <<= 5; - xfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + x_offset)); - y_offset <<= 5; - yfilter = _mm256_load_si256( - (__m256i const *)(bilinear_filters_avx2 + y_offset)); - pw8 = _mm256_set1_epi16(8); - // load source and another source starting from the next - // following byte - src_reg = _mm256_loadu_si256((__m256i const *)(src)); - MERGE_NEXT_SRC(src_reg, 1) - - FILTER_SRC(xfilter) - // convert each 16 bit to 8 bit to each low and high lane source - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - for (i = 0; i < height; i++) { - src += src_stride; - LOAD_SRC_DST - MERGE_NEXT_SRC(src_reg, 1) - FILTER_SRC(xfilter) - src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - // merge 
previous pack to current pack source - MERGE_WITH_SRC(src_pack, src_reg) - // filter the source - FILTER_SRC(yfilter) - src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); - sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); - src_pack = _mm256_avg_epu8(src_pack, sec_reg); - MERGE_WITH_SRC(src_pack, zero_reg) - src_pack = src_reg; - sec += sec_stride; - CALC_SUM_SSE_INSIDE_LOOP - dst += dst_stride; - } - } - } - CALC_SUM_AND_SSE - return sum; -} diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c index 1161da491..8d8bf183b 100644 --- a/libvpx/vpx_dsp/x86/variance_sse2.c +++ b/libvpx/vpx_dsp/x86/variance_sse2.c @@ -222,7 +222,7 @@ unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride, unsigned int *sse) { int sum; vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum); - return *sse - (((uint32_t)((int64_t)sum * sum)) >> 8); + return *sse - (uint32_t)(((int64_t)sum * sum) >> 8); } unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride, diff --git a/libvpx/vpx_dsp/x86/vpx_asm_stubs.c b/libvpx/vpx_dsp/x86/vpx_asm_stubs.c index 727d9d115..4f164afeb 100644 --- a/libvpx/vpx_dsp/x86/vpx_asm_stubs.c +++ b/libvpx/vpx_dsp/x86/vpx_asm_stubs.c @@ -41,38 +41,38 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; // void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, 
-// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, sse2); FUN_CONV_2D(avg_, sse2); @@ -140,22 +140,22 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int y_step_q4, // int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h, int bd); // void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); HIGH_FUN_CONV_2D(, sse2); HIGH_FUN_CONV_2D(avg_, sse2); #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 diff --git a/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index 389a692db..3f444e2e6 100644 --- a/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -20,14 +20,14 @@ SECTION .text %endif %ifidn %2, highbd %define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd + f, fxo, fxs, fyo, fys, w, h, bd %else %define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h + f, fxo, fxs, fyo, fys, w, h %endif mov r4d, dword wm %ifidn %2, highbd diff --git a/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm b/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm index bfc816f23..d83507dc9 100644 --- a/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm @@ -197,6 +197,8 @@ movdqu [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, diff --git a/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm b/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm index 72f2ff71d..9bffe504b 100644 --- a/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_high_subpixel_bilinear_sse2.asm @@ -171,6 +171,8 @@ %endm %endif +SECTION .text + global sym(vpx_highbd_filter_block1d4_v2_sse2) PRIVATE sym(vpx_highbd_filter_block1d4_v2_sse2): push rbp diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 7c1ecc014..d0919695c 100644 --- 
a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -12,9 +12,10 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_ports/mem.h" -// filters for 16_h8 and 16_v8 +// filters for 16_h8 DECLARE_ALIGNED(32, static const uint8_t, filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 @@ -35,493 +36,296 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; -#if defined(__clang__) -#if (__clang_major__ > 0 && __clang_major__ < 3) || \ - (__clang_major__ == 3 && __clang_minor__ <= 3) || \ - (defined(__APPLE__) && defined(__apple_build_version__) && \ - ((__clang_major__ == 4 && __clang_minor__ <= 2) || \ - (__clang_major__ == 5 && __clang_minor__ == 0))) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#else // clang > 3.3, and not 5.0 on macosx. -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // clang <= 3.3 -#elif defined(__GNUC__) -#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6) -#define MM256_BROADCASTSI128_SI256(x) \ - _mm_broadcastsi128_si256((__m128i const *)&(x)) -#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7 -#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x) -#else // gcc > 4.7 -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // gcc <= 4.6 -#else // !(gcc || clang) -#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x) -#endif // __clang__ - -static void vpx_filter_block1d16_h8_avx2( +static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, - ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; - __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; - __m256i srcReg32b1, srcReg32b2, filtersReg32; + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i outReg32b1, outReg32b2; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
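The setup being deleted here (pack the eight 16-bit taps down to bytes, broadcast them across both 128-bit lanes, then shuffle out four registers each holding one duplicated tap pair) is exactly what the new shuffle_filter_avx2() helper factors out. The point of that layout is that _mm256_maddubs_epi16 can then apply two adjacent taps per 16-bit lane in a single instruction. A scalar sketch of one such lane follows; the function name is illustrative, not part of libvpx, and the real instruction additionally saturates the sum to int16:

#include <stdint.h>

// One lane of _mm256_maddubs_epi16 with a duplicated tap pair (f0, f1):
// unsigned source bytes times signed filter bytes, summed pairwise.
// (The hardware saturates this sum to int16; omitted for clarity.)
static int16_t madd_tap_pair(uint8_t s0, uint8_t s1, int8_t f0, int8_t f1) {
  return (int16_t)(s0 * f0 + s1 * f1);
}

Duplicating each pair across the whole register is what lets the same two taps be applied to all sixteen pixel pairs produced by the filt1..filt4 shuffles.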
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); - - filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2); - filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2); - filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2); - filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2); + shuffle_filter_avx2(filter, f); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); // multiple the size of the source and destination stride by two src_stride = src_pixels_per_line << 1; dst_stride = output_pitch << 1; for (i = output_height; i > 1; i -= 2) { + __m256i srcReg; + // load the 2 strides of source - srcReg32b1 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg32b1 = _mm256_inserti128_si256( - srcReg32b1, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), 1); // filter the source buffer - srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); - - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b1 = convolve8_16_avx2(s, f); // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg32b2 = + srcReg = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg32b2 = _mm256_inserti128_si256( - srcReg32b2, + srcReg = _mm256_inserti128_si256( + srcReg, _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), 1); - // add 
and saturate the results together - srcRegFilt32b1_1 = _mm256_adds_epi16( - srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - // filter the source buffer - srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); - // filter the source buffer - srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); + outReg32b2 = convolve8_16_avx2(s, f); - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); - - // add and saturate the results together - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2)); - srcRegFilt32b2_1 = _mm256_adds_epi16( - srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2)); - - srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64); - - srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); - srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg32b1 = _mm256_packus_epi16(outReg32b1, outReg32b2); src_ptr += src_stride; + // average if necessary + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + outReg2 = _mm_avg_epu8( + outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + } + // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, - _mm256_castsi256_si128(srcRegFilt32b1_1)); + _mm_store_si128((__m128i *)output_ptr, outReg1); // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), - _mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); + output_ptr += dst_stride; } // if the number of strides is odd. 
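The single `avg` flag added to the new `_x_` helper is what lets one body serve both the plain and the `avg_` convolve entry points: when set, the freshly filtered bytes are blended with whatever is already in the destination via _mm_avg_epu8, which averages with round-up. The per-byte arithmetic, as a sketch (name illustrative):

#include <stdint.h>

// Per-byte semantics of _mm_avg_epu8(pred, dst): average rounded upward,
// matching the convolve8_avg_* output definition.
static uint8_t avg_round_up(uint8_t pred, uint8_t dst) {
  return (uint8_t)((pred + dst + 1) >> 1);
}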
// process only 16 bytes if (i > 0) { - __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1; - __m128i srcRegFilt2, srcRegFilt3; + __m128i srcReg; - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + // load the first 16 bytes of the last row + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = - _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg1 = convolve8_8_avx2(s, f); // reading the next 16 bytes // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); - - // add and saturate the results together - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); + srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); // filter the source buffer - srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = - _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg)); - srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = - _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters)); - srcRegFilt2 = - _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = - _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64)); - - srcRegFilt2_1 = - _mm_adds_epi16(srcRegFilt2_1, 
_mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); + s[0] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); + s[1] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); + s[2] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); + s[3] = _mm256_castsi128_si256( + _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); + outReg2 = convolve8_8_avx2(s, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); + + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } -static void vpx_filter_block1d16_v8_avx2( +static void vpx_filter_block1d16_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 0); +} + +static void vpx_filter_block1d16_h8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *output_ptr, + ptrdiff_t dst_stride, uint32_t output_height, const int16_t *filter) { + vpx_filter_block1d16_h8_x_avx2(src_ptr, src_stride, output_ptr, dst_stride, + output_height, filter, 1); +} + +static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, - ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i filtersReg; - __m256i addFilterReg64; - __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5; - __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10; - __m256i srcReg32b11, srcReg32b12, filtersReg32; - __m256i firstFilters, secondFilters, thirdFilters, forthFilters; + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, + const int avg) { + __m128i outReg1, outReg2; + __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; + __m256i f[4], s1[4], s2[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm256_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the - // same data in both lanes of 128 bit register. 
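As in the horizontal path, the long maddubs/adds/min-max chain being removed from the vertical filter computes an 8-tap convolution with round-to-nearest by 64, a shift by FILTER_BITS (7), and saturation to 8 bits on the final pack. A plain-C reference for one output pixel, as a sketch of the intent only (it ignores the intermediate 16-bit saturation ordering that the min/max pairing in the SIMD code exists to control):

#include <stdint.h>

// One 8-tap filtered pixel: round by 64, shift by FILTER_BITS (7), then
// clamp to [0, 255] as _mm_packus_epi16 does on the final pack.
static uint8_t convolve8_pixel(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  sum = (sum + 64) >> 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}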
- filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - // have the same data in both lanes of a 256 bit register - filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 256 bit register - firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 256 bit register - secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 256 bit register - thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 256 bit register - forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u)); + shuffle_filter_avx2(filter, f); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; - // load 16 bytes 7 times in stride of src_pitch - srcReg32b1 = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr))); - srcReg32b2 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch))); - srcReg32b3 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2))); - srcReg32b4 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3))); - srcReg32b5 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4))); - srcReg32b6 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5))); - srcReg32b7 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6))); - - // have each consecutive loads on the same 256 register - srcReg32b1 = _mm256_inserti128_si256(srcReg32b1, - _mm256_castsi256_si128(srcReg32b2), 1); - srcReg32b2 = _mm256_inserti128_si256(srcReg32b2, - _mm256_castsi256_si128(srcReg32b3), 1); - srcReg32b3 = _mm256_inserti128_si256(srcReg32b3, - _mm256_castsi256_si128(srcReg32b4), 1); - srcReg32b4 = _mm256_inserti128_si256(srcReg32b4, - _mm256_castsi256_si128(srcReg32b5), 1); - srcReg32b5 = _mm256_inserti128_si256(srcReg32b5, - _mm256_castsi256_si128(srcReg32b6), 1); - srcReg32b6 = _mm256_inserti128_si256(srcReg32b6, - _mm256_castsi256_si128(srcReg32b7), 1); - - // merge every two consecutive registers except the last one - srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2); - srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2); - - // save - srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4); - - // save - srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6); - - // save - srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6); + { + __m128i s[6]; + __m256i s32b[6]; + + // load 16 bytes 7 times in stride of src_pitch + s[0] = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + 5 * src_pitch)); + srcRegHead1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 6 * src_pitch))); + + // have each consecutive loads on the same 256 register + s32b[0] = 
_mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + s32b[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + s32b[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + s32b[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + s32b[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + s32b[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), + _mm256_castsi256_si128(srcRegHead1), 1); + + // merge every two consecutive registers except the last one + // the first lanes contain values for filtering odd rows (1,3,5...) and + // the second lanes contain values for filtering even rows (2,4,6...) + s1[0] = _mm256_unpacklo_epi8(s32b[0], s32b[1]); + s2[0] = _mm256_unpackhi_epi8(s32b[0], s32b[1]); + s1[1] = _mm256_unpacklo_epi8(s32b[2], s32b[3]); + s2[1] = _mm256_unpackhi_epi8(s32b[2], s32b[3]); + s1[2] = _mm256_unpacklo_epi8(s32b[4], s32b[5]); + s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]); + } for (i = output_height; i > 1; i -= 2) { - // load the last 2 loads of 16 bytes and have every two + __m256i srcRegHead2, srcRegHead3; + + // load the next 2 loads of 16 bytes and have every two // consecutive loads in the same 256 bit register - srcReg32b8 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7))); - srcReg32b7 = _mm256_inserti128_si256(srcReg32b7, - _mm256_castsi256_si128(srcReg32b8), 1); - srcReg32b9 = _mm256_castsi128_si256( - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8))); - srcReg32b8 = _mm256_inserti128_si256(srcReg32b8, - _mm256_castsi256_si128(srcReg32b9), 1); - - // merge every two consecutive registers - // save - srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8); - srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters); - - // add and saturate the results together - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters); - srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters); - - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6); - - // multiply 2 adjacent elements with the filter and add the result - srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters); - srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters); - - // add and saturate the results together - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_min_epi16(srcReg32b8, srcReg32b12)); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, - _mm256_max_epi16(srcReg32b8, srcReg32b12)); - - srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64); - srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7); - srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the 
first - // convolve result and the second lane contain the second convolve - // result - srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1); + srcRegHead2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 7 * src_pitch))); + srcRegHead1 = _mm256_inserti128_si256( + srcRegHead1, _mm256_castsi256_si128(srcRegHead2), 1); + srcRegHead3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + 8 * src_pitch))); + srcRegHead2 = _mm256_inserti128_si256( + srcRegHead2, _mm256_castsi256_si128(srcRegHead3), 1); + + // merge the two new consecutive registers + // the first lane contain values for filtering odd rows (1,3,5...) and + // the second lane contain values for filtering even rows (2,4,6...) + s1[3] = _mm256_unpacklo_epi8(srcRegHead1, srcRegHead2); + s2[3] = _mm256_unpackhi_epi8(srcRegHead1, srcRegHead2); + + s1[0] = convolve8_16_avx2(s1, f); + s2[0] = convolve8_16_avx2(s2, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + s1[0] = _mm256_packus_epi16(s1[0], s2[0]); src_ptr += src_stride; + // average if necessary + outReg1 = _mm256_castsi256_si128(s1[0]); + outReg2 = _mm256_extractf128_si256(s1[0], 1); + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + outReg2 = _mm_avg_epu8( + outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + } + // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(srcReg32b1)); + _mm_store_si128((__m128i *)output_ptr, outReg1); // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), - _mm256_extractf128_si256(srcReg32b1, 1)); + _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); output_ptr += dst_stride; - // save part of the registers for next strides - srcReg32b10 = srcReg32b11; - srcReg32b1 = srcReg32b3; - srcReg32b11 = srcReg32b2; - srcReg32b3 = srcReg32b5; - srcReg32b2 = srcReg32b4; - srcReg32b5 = srcReg32b7; - srcReg32b7 = srcReg32b9; + // shift down by two rows + s1[0] = s1[1]; + s2[0] = s2[1]; + s1[1] = s1[2]; + s2[1] = s2[2]; + s1[2] = s1[3]; + s2[2] = s2[3]; + srcRegHead1 = srcRegHead3; } + + // if the number of strides is odd. 
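The register rotation just above ("shift down by two rows") keeps the six already-unpacked row pairs alive so each iteration loads only two fresh rows: a software delay line over the 8-tap vertical window. The same access pattern in scalar form for a single pixel column, under the usual (sum + 64) >> 7 rounding (a sketch; the function name is hypothetical):

#include <stddef.h>
#include <stdint.h>

// Vertical 8-tap over one pixel column: output row r reads source rows
// r..r+7, so consecutive outputs share seven rows -- the data the AVX2
// loop carries across iterations instead of reloading.
static void filter_column(const uint8_t *src, ptrdiff_t src_pitch,
                          uint8_t *dst, ptrdiff_t dst_pitch, int height,
                          const int16_t *filter) {
  int r, k;
  for (r = 0; r < height; ++r) {
    int sum = 0;
    for (k = 0; k < 8; ++k) sum += src[(r + k) * src_pitch] * filter[k];
    sum = (sum + 64) >> 7;
    dst[r * dst_pitch] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
  }
}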
+ // process only 16 bytes if (i > 0) { - __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5; - __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8; // load the last 16 bytes - srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); + const __m128i srcRegHead2 = + _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); // merge the last 2 results together - srcRegFilt4 = - _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - srcRegFilt7 = - _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt4 = - _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters)); - srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1), - _mm256_castsi256_si128(firstFilters)); - srcRegFilt7 = - _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters)); - - // add and saturate the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11), - _mm256_castsi256_si128(secondFilters)); - srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3), - _mm256_castsi256_si128(secondFilters)); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2), - _mm256_castsi256_si128(thirdFilters)); - srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5), - _mm256_castsi256_si128(thirdFilters)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_min_epi16(srcRegFilt5, srcRegFilt7)); - - // add and saturate the results together - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt4, srcRegFilt6)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm_max_epi16(srcRegFilt5, srcRegFilt7)); - - srcRegFilt1 = - _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg64)); - srcRegFilt3 = - _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg64)); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3); + s1[0] = _mm256_castsi128_si256( + _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); + s2[0] = _mm256_castsi128_si256( + _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2)); + + outReg1 = convolve8_8_avx2(s1, f); + outReg2 = convolve8_8_avx2(s2, f); + + // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane + // contain the first and second convolve result respectively + outReg1 = _mm_packus_epi16(outReg1, outReg2); + + // average if necessary + if (avg) { + outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); + } // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, srcRegFilt1); + _mm_store_si128((__m128i *)output_ptr, outReg1); } } +static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr, + ptrdiff_t src_stride, uint8_t 
*dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 0); +} + +static void vpx_filter_block1d16_v8_avg_avx2( + const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *filter) { + vpx_filter_block1d16_v8_x_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, filter, 1); +} + #if HAVE_AVX2 && HAVE_SSSE3 filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if ARCH_X86_64 @@ -539,6 +343,14 @@ filter8_1dfunction vpx_filter_block1d4_h8_ssse3; #define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 #define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // ARCH_X86_64 +filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v8_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h8_avg_ssse3; +#define vpx_filter_block1d8_v8_avg_avx2 vpx_filter_block1d8_v8_avg_ssse3 +#define vpx_filter_block1d8_h8_avg_avx2 vpx_filter_block1d8_h8_avg_ssse3 +#define vpx_filter_block1d4_v8_avg_avx2 vpx_filter_block1d4_v8_avg_ssse3 +#define vpx_filter_block1d4_h8_avg_avx2 vpx_filter_block1d4_h8_avg_ssse3 filter8_1dfunction vpx_filter_block1d16_v2_ssse3; filter8_1dfunction vpx_filter_block1d16_h2_ssse3; filter8_1dfunction vpx_filter_block1d8_v2_ssse3; @@ -552,23 +364,53 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; #define vpx_filter_block1d8_h2_avx2 vpx_filter_block1d8_h2_ssse3 #define vpx_filter_block1d4_v2_avx2 vpx_filter_block1d4_v2_ssse3 #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 +filter8_1dfunction vpx_filter_block1d16_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d16_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d8_h2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_v2_avg_ssse3; +filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; +#define vpx_filter_block1d16_v2_avg_avx2 vpx_filter_block1d16_v2_avg_ssse3 +#define vpx_filter_block1d16_h2_avg_avx2 vpx_filter_block1d16_h2_avg_ssse3 +#define vpx_filter_block1d8_v2_avg_avx2 vpx_filter_block1d8_v2_avg_ssse3 +#define vpx_filter_block1d8_h2_avg_avx2 vpx_filter_block1d8_h2_avg_ssse3 +#define vpx_filter_block1d4_v2_avg_avx2 vpx_filter_block1d4_v2_avg_ssse3 +#define vpx_filter_block1d4_h2_avg_avx2 vpx_filter_block1d4_h2_avg_ssse3 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +// void vpx_convolve8_avg_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +// void vpx_convolve8_avg_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t 
*dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); +// void vpx_convolve8_avg_avx2(const uint8_t *src, ptrdiff_t src_stride, +// uint8_t *dst, ptrdiff_t dst_stride, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, +// int w, int h); FUN_CONV_2D(, avx2); +FUN_CONV_2D(avg_, avx2); #endif // HAVE_AVX2 && HAVE_SSSE3 diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 09c75d455..e4f992780 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -8,52 +8,37 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include <tmmintrin.h> +#include <tmmintrin.h> // SSSE3 + +#include <string.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_ssse3.h" +#include "vpx_dsp/x86/mem_sse2.h" +#include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, static const uint8_t, filt1_4_h8[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_4_h8[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, static const uint8_t, filt1_global[16]) = { - 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt2_global[16]) = { - 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt3_global[16]) = { - 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 -}; - -DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = { - 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 -}; // These are reused by the avx2 intrinsics.
-filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; +// vpx_filter_block1d8_v8_intrin_ssse3() +// vpx_filter_block1d8_h8_intrin_ssse3() +// vpx_filter_block1d4_h8_intrin_ssse3() + +static INLINE __m128i shuffle_filter_convolve8_8_ssse3( + const __m128i *const s, const int16_t *const filter) { + __m128i f[4]; + shuffle_filter_ssse3(filter, f); + return convolve8_8_ssse3(s, f); +} void vpx_filter_block1d4_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, shuffle1, shuffle2; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; + __m128i srcRegFilt1, srcRegFilt2; + __m128i addFilterReg64, filtersReg, srcReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 @@ -75,8 +60,8 @@ void vpx_filter_block1d4_h8_intrin_ssse3( secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); // loading the local filters - shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8); - shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8); + shuffle1 = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6); + shuffle2 = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); @@ -89,25 +74,23 @@ void vpx_filter_block1d4_h8_intrin_ssse3( srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); + // sum the results together, saturating only on the final step + // the specific order of the additions prevents outranges + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, srcRegFilt2); - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); + // extract the higher half of the register + srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); + // add the rounding offset early to avoid another saturated add + srcRegFilt1 = _mm_add_epi16(srcRegFilt1, addFilterReg64); + srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 4 bytes *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1); @@ -117,77 +100,35 @@ void vpx_filter_block1d4_h8_intrin_ssse3( } void vpx_filter_block1d8_h8_intrin_ssse3( - const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i 
srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; unsigned int i; + __m128i f[4], filt[4], s[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); + shuffle_filter_ssse3(filter, f); + filt[0] = _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); + filt[1] = _mm_setr_epi8(2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10); + filt[2] = _mm_setr_epi8(4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12); + filt[3] = + _mm_setr_epi8(6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14); for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer - srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + s[0] = _mm_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm_shuffle_epi8(srcReg, filt[3]); + s[0] = convolve8_8_ssse3(s, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + s[0] = _mm_packus_epi16(s[0], s[0]); - src_ptr += src_pixels_per_line; + src_ptr += src_pitch; // save only 8 bytes - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i 
*)&output_ptr[0], s[0]); output_ptr += output_pitch; } @@ -196,83 +137,49 @@ void vpx_filter_block1d8_h8_intrin_ssse3( void vpx_filter_block1d8_v8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; unsigned int i; + __m128i f[4], s[8], ss[4]; - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg = _mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); + shuffle_filter_ssse3(filter, f); // load the first 7 rows of 8 bytes - srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr); - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); for (i = 0; i < output_height; i++) { // load the last 8 bytes - srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = 
_mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + ss[0] = convolve8_8_ssse3(ss, f); // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); + ss[0] = _mm_packus_epi16(ss[0], ss[0]); src_ptr += src_pitch; // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; + s[0] = s[1]; + s[1] = s[2]; + s[2] = s[3]; + s[3] = s[4]; + s[4] = s[5]; + s[5] = s[6]; + s[6] = s[7]; // save only 8 bytes convolve result - _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); + _mm_storel_epi64((__m128i *)&output_ptr[0], ss[0]); output_ptr += out_pitch; } @@ -306,149 +213,69 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); - -#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ - out2, out3, out4, out5, out6, out7) \ - { \ - const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \ - const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \ - const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \ - const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \ - \ - const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \ - const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \ - const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \ - const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \ - \ - const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \ - const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \ - const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \ - const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \ - \ - out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \ - out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \ - out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \ - out3 = 
_mm_unpackhi_epi64(tr2_1, tr2_1); \ - out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \ - out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \ - out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \ - out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \ - } - -static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *x_filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)x_filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_x); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7)); - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 - const __m128i tr0_2 = _mm_unpacklo_epi16(E, F); - // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 - const __m128i tr0_3 = _mm_unpacklo_epi16(G, H); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 - const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3); - // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 - const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2); - const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2); - const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3); - const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, 
ssse3); + +static void filter_horiz_w8_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const x_filter) { + __m128i s[8], ss[4], temp; + + load_8bit_8x8(src, src_stride, s); + // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 + // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73 + // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75 + // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77 + transpose_16bit_4x8(s, ss); + temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A, B, C, D, E, F, G, H; - - A = _mm_loadl_epi64((const __m128i *)src); - B = _mm_loadl_epi64((const __m128i *)(src + src_stride)); - C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2)); - D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3)); - E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4)); - F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5)); - G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6)); - H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7)); - - TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H); - - _mm_storel_epi64((__m128i *)dst, A); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G); - _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H); +static void transpose8x8_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[8]; + + load_8bit_8x8(src, src_stride, s); + transpose_8bit_8x8(s, s); + store_8bit_8x8(s, dst, dst_stride); } -static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w8(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; - // This function processes 8x8 areas. The intermediate height is not always + // This function processes 8x8 areas. The intermediate height is not always // a multiple of 8, so force it to be a multiple of 8 here. 
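// Worked example (illustration only): h = 13 gives y = 13 + (8 - 5) = 16; a
// height that is already a multiple of 8 still rounds up by a full 8
// (h = 16 -> y = 24), and the 8 tail rows reserved in the intermediate
// buffer absorb those extra writes.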
y = h + (8 - (h & 0x7)); @@ -479,93 +306,50 @@ static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride, } while (y -= 8); } -static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - // TRANSPOSE... - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // - // TO - // - // 00 10 20 30 - // 01 11 21 31 - // 02 12 22 32 - // 03 13 23 33 - // 04 14 24 34 - // 05 15 25 35 - // 06 16 26 36 - // 07 17 27 37 - // - // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 - const __m128i tr0_0 = _mm_unpacklo_epi16(A, B); - // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 - const __m128i tr0_1 = _mm_unpacklo_epi16(C, D); - // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 - const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 - const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1); +static void filter_horiz_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter) { + __m128i s[4], ss[2]; + __m128i temp; + + load_8bit_8x4(src, src_stride, s); + transpose_16bit_4x4(s, ss); + // 00 01 10 11 20 21 30 31 + s[0] = ss[0]; // 02 03 12 13 22 23 32 33 - const __m128i s3s2 = _mm_srli_si128(s1s0, 8); + s[1] = _mm_srli_si128(ss[0], 8); + // 04 05 14 15 24 25 34 35 + s[2] = ss[1]; // 06 07 16 17 26 27 36 37 - const __m128i s7s6 = _mm_srli_si128(s5s4, 8); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); + s[3] = _mm_srli_si128(ss[1], 8); + + temp = shuffle_filter_convolve8_8_ssse3(s, filter); // shrink to 8 bit each 16 bits temp = _mm_packus_epi16(temp, temp); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride) { - __m128i A = _mm_cvtsi32_si128(*(const int *)src); - __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride)); - __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2)); - __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3)); - // 00 10 01 11 02 12 03 13 - const __m128i tr0_0 = 
_mm_unpacklo_epi8(A, B); - // 20 30 21 31 22 32 23 33 - const __m128i tr0_1 = _mm_unpacklo_epi8(C, D); - // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 - A = _mm_unpacklo_epi16(tr0_0, tr0_1); - B = _mm_srli_si128(A, 4); - C = _mm_srli_si128(A, 8); - D = _mm_srli_si128(A, 12); - - *(int *)(dst) = _mm_cvtsi128_si32(A); - *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B); - *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C); - *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D); +static void transpose4x4_to_dst(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride) { + __m128i s[4]; + + load_8bit_4x4(src, src_stride, s); + s[0] = transpose_8bit_4x4(s); + s[1] = _mm_srli_si128(s[0], 4); + s[2] = _mm_srli_si128(s[0], 8); + s[3] = _mm_srli_si128(s[0], 12); + store_8bit_4x4(s, dst, dst_stride); } -static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *x_filters, int x0_q4, - int x_step_q4, int w, int h) { +static void scaledconvolve_horiz_w4(const uint8_t *src, + const ptrdiff_t src_stride, uint8_t *dst, + const ptrdiff_t dst_stride, + const InterpKernel *const x_filters, + const int x0_q4, const int x_step_q4, + const int w, const int h) { DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]); int x, y, z; src -= SUBPEL_TAPS / 2 - 1; @@ -597,50 +381,41 @@ static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr); - const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch)); - const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); +static __m128i filter_vert_kernel(const __m128i 
*const s, + const int16_t *const filter) { + __m128i ss[4]; + __m128i temp; + + // 00 10 01 11 02 12 03 13 + ss[0] = _mm_unpacklo_epi8(s[0], s[1]); + // 20 30 21 31 22 32 23 33 + ss[1] = _mm_unpacklo_epi8(s[2], s[3]); + // 40 50 41 51 42 52 43 53 + ss[2] = _mm_unpacklo_epi8(s[4], s[5]); + // 60 70 61 71 62 72 63 73 + ss[3] = _mm_unpacklo_epi8(s[6], s[7]); + + temp = shuffle_filter_convolve8_8_ssse3(ss, filter); // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); + return _mm_packus_epi16(temp, temp); +} + +static void filter_vert_w4_ssse3(const uint8_t *const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8]; + __m128i temp; + + load_8bit_4x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 4 bytes *(int *)dst = _mm_cvtsi128_si32(temp); } -static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w4( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -659,50 +434,21 @@ static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); - const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr); - const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)); - const __m128i s1s0 = _mm_unpacklo_epi8(A, B); - const __m128i s3s2 = _mm_unpacklo_epi8(C, D); - const __m128i s5s4 = _mm_unpacklo_epi8(E, F); - const __m128i s7s6 = _mm_unpacklo_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0); - const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2); - const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4); - const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6); - // add and saturate the results together - const __m128i min_x2x1 = _mm_min_epi16(x2, x1); - const __m128i max_x2x1 = _mm_max_epi16(x2, x1); - __m128i temp = _mm_adds_epi16(x0, x3); - temp = _mm_adds_epi16(temp, min_x2x1); - temp = _mm_adds_epi16(temp, max_x2x1); - // round and shift by 7 bit each 16 bit - temp = _mm_mulhrs_epi16(temp, k_256); - // shrink to 8 bit each 16 bits - temp = _mm_packus_epi16(temp, temp); +static void filter_vert_w8_ssse3(const uint8_t 
*const src, + const ptrdiff_t src_stride, uint8_t *const dst, + const int16_t *const filter) { + __m128i s[8], temp; + + load_8bit_8x8(src, src_stride, s); + temp = filter_vert_kernel(s, filter); // save only 8 bytes convolve result _mm_storel_epi64((__m128i *)dst, temp); } -static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w8( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -719,81 +465,44 @@ static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride, } } -static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *dst, const int16_t *filter, int w) { - const __m128i k_256 = _mm_set1_epi16(1 << 8); - const __m128i f_values = _mm_load_si128((const __m128i *)filter); - // pack and duplicate the filter values - const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u)); - const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u)); - const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u)); - const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu)); +static void filter_vert_w16_ssse3(const uint8_t *src, + const ptrdiff_t src_stride, + uint8_t *const dst, + const int16_t *const filter, const int w) { int i; + __m128i f[4]; + shuffle_filter_ssse3(filter, f); for (i = 0; i < w; i += 16) { - const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr); - const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - const __m128i C = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - const __m128i D = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - const __m128i E = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - const __m128i F = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - const __m128i G = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - const __m128i H = - _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - // merge the result together - const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B); - const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H); - const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B); - const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0); - const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6); - const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0); - const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6); - // add and saturate the results together - const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo); - const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi); - // merge the result together - const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D); - const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D); - // multiply 2 adjacent elements with the filter and add the result - const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2); - const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2); + __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi; + + loadu_8bit_16x8(src, src_stride, s); + // merge the result together - const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F); - const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F); - // 
multiply 2 adjacent elements with the filter and add the result - const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4); - const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4); - // add and saturate the results together - __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo)); - __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi)); - - // add and saturate the results together - temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo)); - temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi)); - // round and shift by 7 bit each 16 bit - temp_lo = _mm_mulhrs_epi16(temp_lo, k_256); - temp_hi = _mm_mulhrs_epi16(temp_hi, k_256); - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result + s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]); + s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]); + s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]); + s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]); + s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]); + s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]); + s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]); + s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]); + temp_lo = convolve8_8_ssse3(s_lo, f); + temp_hi = convolve8_8_ssse3(s_hi, f); + + // shrink to 8 bit each 16 bits, the first lane contain the first convolve + // result and the second lane contain the second convolve result temp_hi = _mm_packus_epi16(temp_lo, temp_hi); - src_ptr += 16; + src += 16; // save 16 bytes convolve result _mm_store_si128((__m128i *)&dst[i], temp_hi); } } -static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *y_filters, int y0_q4, - int y_step_q4, int w, int h) { +static void scaledconvolve_vert_w16( + const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst, + const ptrdiff_t dst_stride, const InterpKernel *const y_filters, + const int y0_q4, const int y_step_q4, const int w, const int h) { int y; int y_q4 = y0_q4; @@ -811,11 +520,10 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, } } -static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { +void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -829,60 +537,49 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. // --Require an additional 8 rows for the horiz_w8 transpose tail. + // When calling in frame scaling function, the smallest scaling factor is x1/4 + // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still + // big enough. 
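// Worked check of the relaxed bound (illustrative arithmetic): with
// y_step_q4 = 64, h <= 32 and a subpel offset y0_q4 of at most 15,
// intermediate_height is at most ((32 - 1) * 64 + 15) >> 4 + 8 = 132 rows,
// which still fits in the 135 + 8 rows reserved below.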
DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]); const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); assert(h <= 64); - assert(y_step_q4 <= 32); - assert(x_step_q4 <= 32); + assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); + assert(x_step_q4 <= 64); if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } } -void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); -} - // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, ssse3); FUN_CONV_2D(avg_, ssse3); diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm index 08f3d6a6c..8497e1721 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_sse2.asm @@ -176,6 +176,8 @@ movq [rdi + %2], xmm0 %endm +SECTION .text + ;void vpx_filter_block1d4_v8_sse2 ;( ; unsigned char *src_ptr, diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm index c1a6f23ab..952d9307d 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm @@ -327,12 +327,12 @@ cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_HFILTER16 h8 -SUBPIX_HFILTER16 h8_avg -SUBPIX_HFILTER8 h8 -SUBPIX_HFILTER8 h8_avg -SUBPIX_HFILTER4 h8 -SUBPIX_HFILTER4 h8_avg +SUBPIX_HFILTER16 h8 ; vpx_filter_block1d16_h8_ssse3 +SUBPIX_HFILTER16 h8_avg ; vpx_filter_block1d16_h8_avg_ssse3 
+SUBPIX_HFILTER8 h8 ; vpx_filter_block1d8_h8_ssse3 +SUBPIX_HFILTER8 h8_avg ; vpx_filter_block1d8_h8_avg_ssse3 +SUBPIX_HFILTER4 h8 ; vpx_filter_block1d4_h8_ssse3 +SUBPIX_HFILTER4 h8_avg ; vpx_filter_block1d4_h8_avg_ssse3 ;------------------------------------------------------------------------------- @@ -795,9 +795,9 @@ cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \ %endm INIT_XMM ssse3 -SUBPIX_VFILTER16 v8 -SUBPIX_VFILTER16 v8_avg -SUBPIX_VFILTER v8, 8 -SUBPIX_VFILTER v8_avg, 8 -SUBPIX_VFILTER v8, 4 -SUBPIX_VFILTER v8_avg, 4 +SUBPIX_VFILTER16 v8 ; vpx_filter_block1d16_v8_ssse3 +SUBPIX_VFILTER16 v8_avg ; vpx_filter_block1d16_v8_avg_ssse3 +SUBPIX_VFILTER v8, 8 ; vpx_filter_block1d8_v8_ssse3 +SUBPIX_VFILTER v8_avg, 8 ; vpx_filter_block1d8_v8_avg_ssse3 +SUBPIX_VFILTER v8, 4 ; vpx_filter_block1d4_v8_ssse3 +SUBPIX_VFILTER v8_avg, 4 ; vpx_filter_block1d4_v8_avg_ssse3 diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm index a378dd040..6d79492e4 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_sse2.asm @@ -131,6 +131,8 @@ dec rcx %endm +SECTION .text + global sym(vpx_filter_block1d4_v2_sse2) PRIVATE sym(vpx_filter_block1d4_v2_sse2): push rbp diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm index 538b2129d..8c9c817be 100644 --- a/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm +++ b/libvpx/vpx_dsp/x86/vpx_subpixel_bilinear_ssse3.asm @@ -105,6 +105,8 @@ dec rcx %endm +SECTION .text + global sym(vpx_filter_block1d4_v2_ssse3) PRIVATE sym(vpx_filter_block1d4_v2_ssse3): push rbp diff --git a/libvpx/vpx_mem/vpx_mem.c b/libvpx/vpx_mem/vpx_mem.c index a9be08680..eeba34c37 100644 --- a/libvpx/vpx_mem/vpx_mem.c +++ b/libvpx/vpx_mem/vpx_mem.c @@ -82,12 +82,3 @@ void vpx_free(void *memblk) { free(addr); } } - -#if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length) { - size_t i; - uint16_t *dest16 = (uint16_t *)dest; - for (i = 0; i < length; i++) *dest16++ = val; - return dest; -} -#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/libvpx/vpx_mem/vpx_mem.h b/libvpx/vpx_mem/vpx_mem.h index 733aff488..a4274b885 100644 --- a/libvpx/vpx_mem/vpx_mem.h +++ b/libvpx/vpx_mem/vpx_mem.h @@ -19,6 +19,8 @@ #include <stdlib.h> #include <stddef.h> +#include "vpx/vpx_integer.h" + #if defined(__cplusplus) extern "C" { #endif @@ -29,7 +31,12 @@ void *vpx_calloc(size_t num, size_t size); void vpx_free(void *memblk); #if CONFIG_VP9_HIGHBITDEPTH -void *vpx_memset16(void *dest, int val, size_t length); +static INLINE void *vpx_memset16(void *dest, int val, size_t length) { + size_t i; + uint16_t *dest16 = (uint16_t *)dest; + for (i = 0; i < length; i++) *dest16++ = val; + return dest; +} #endif #include <string.h> diff --git a/libvpx/vpx_ports/asmdefs_mmi.h b/libvpx/vpx_ports/asmdefs_mmi.h new file mode 100644 index 000000000..a9a49745a --- /dev/null +++ b/libvpx/vpx_ports/asmdefs_mmi.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_PORTS_ASMDEFS_MMI_H_ +#define VPX_PORTS_ASMDEFS_MMI_H_ + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" + +#if HAVE_MMI + +#if HAVE_MIPS64 +#define mips_reg int64_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "daddu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "daddiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "daddi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "dsubu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "ld " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "dsrl " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "dsll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "dmtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "dli " #reg ", " #immediate " \n\t" + +#else +#define mips_reg int32_t +#define MMI_ADDU(reg1, reg2, reg3) \ + "addu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_ADDIU(reg1, reg2, immediate) \ + "addiu " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_ADDI(reg1, reg2, immediate) \ + "addi " #reg1 ", " #reg2 ", " #immediate " \n\t" + +#define MMI_SUBU(reg1, reg2, reg3) \ + "subu " #reg1 ", " #reg2 ", " #reg3 " \n\t" + +#define MMI_L(reg, addr, bias) \ + "lw " #reg ", " #bias "(" #addr ") \n\t" + +#define MMI_SRL(reg1, reg2, shift) \ + "srl " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_SLL(reg1, reg2, shift) \ + "sll " #reg1 ", " #reg2 ", " #shift " \n\t" + +#define MMI_MTC1(reg, fp) \ + "mtc1 " #reg ", " #fp " \n\t" + +#define MMI_LI(reg, immediate) \ + "li " #reg ", " #immediate " \n\t" + +#endif /* HAVE_MIPS64 */ + +#endif /* HAVE_MMI */ + +#endif /* VPX_PORTS_ASMDEFS_MMI_H_ */ diff --git a/libvpx/vpx_ports/vpx_ports.mk b/libvpx/vpx_ports/vpx_ports.mk index fc0a783b7..e17145e6c 100644 --- a/libvpx/vpx_ports/vpx_ports.mk +++ b/libvpx/vpx_ports/vpx_ports.mk @@ -28,3 +28,7 @@ PORTS_SRCS-$(ARCH_ARM) += arm.h PORTS_SRCS-$(ARCH_PPC) += ppc_cpudetect.c PORTS_SRCS-$(ARCH_PPC) += ppc.h + +ifeq ($(ARCH_MIPS), yes) +PORTS_SRCS-yes += asmdefs_mmi.h +endif diff --git a/libvpx/vpx_ports/x86.h b/libvpx/vpx_ports/x86.h index 5aabb9e3a..ced65ac05 100644 --- a/libvpx/vpx_ports/x86.h +++ b/libvpx/vpx_ports/x86.h @@ -151,16 +151,17 @@ static INLINE uint64_t xgetbv(void) { #endif #endif -#define HAS_MMX 0x01 -#define HAS_SSE 0x02 -#define HAS_SSE2 0x04 -#define HAS_SSE3 0x08 -#define HAS_SSSE3 0x10 -#define HAS_SSE4_1 0x20 -#define HAS_AVX 0x40 -#define HAS_AVX2 0x80 +#define HAS_MMX 0x001 +#define HAS_SSE 0x002 +#define HAS_SSE2 0x004 +#define HAS_SSE3 0x008 +#define HAS_SSSE3 0x010 +#define HAS_SSE4_1 0x020 +#define HAS_AVX 0x040 +#define HAS_AVX2 0x080 +#define HAS_AVX512 0x100 #ifndef BIT -#define BIT(n) (1 << n) +#define BIT(n) (1u << n) #endif static INLINE int x86_simd_caps(void) { @@ -209,6 +210,12 @@ static INLINE int x86_simd_caps(void) { cpuid(7, 0, reg_eax, reg_ebx, reg_ecx, reg_edx); if (reg_ebx & BIT(5)) flags |= HAS_AVX2; + + // bits 16 (AVX-512F) & 17 (AVX-512DQ) & 28 (AVX-512CD) & + // 30 (AVX-512BW) & 31 (AVX-512VL) + if ((reg_ebx & (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) == + (BIT(16) | BIT(17) | BIT(28) | BIT(30) | BIT(31))) + flags |= HAS_AVX512; } } } diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c index a674eac84..9c7ca42c7 100644 --- a/libvpx/vpx_scale/generic/yv12config.c +++
b/libvpx/vpx_scale/generic/yv12config.c @@ -9,6 +9,7 @@ */ #include <assert.h> +#include <limits.h> #include "vpx_scale/yv12config.h" #include "vpx_mem/vpx_mem.h" @@ -165,6 +166,12 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, uint8_t *buf = NULL; + // frame_size is stored in buffer_alloc_sz, which is an int. If it won't + // fit, fail early. + if (frame_size > INT_MAX) { + return -1; + } + if (cb != NULL) { const int align_addr_extra_size = 31; const uint64_t external_frame_size = frame_size + align_addr_extra_size; @@ -193,8 +200,6 @@ int vpx_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, vpx_free(ybf->buffer_alloc); ybf->buffer_alloc = NULL; - if (frame_size != (size_t)frame_size) return -1; - ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, (size_t)frame_size); if (!ybf->buffer_alloc) return -1; diff --git a/libvpx/vpx_scale/generic/yv12extend.c b/libvpx/vpx_scale/generic/yv12extend.c index a6aaff95a..e23180650 100644 --- a/libvpx/vpx_scale/generic/yv12extend.c +++ b/libvpx/vpx_scale/generic/yv12extend.c @@ -111,25 +111,6 @@ void vp8_yv12_extend_frame_borders_c(YV12_BUFFER_CONFIG *ybf) { assert(ybf->y_height - ybf->y_crop_height >= 0); assert(ybf->y_width - ybf->y_crop_width >= 0); -#if CONFIG_VP9_HIGHBITDEPTH - if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) { - extend_plane_high(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, - ybf->y_crop_height, ybf->border, ybf->border, - ybf->border + ybf->y_height - ybf->y_crop_height, - ybf->border + ybf->y_width - ybf->y_crop_width); - - extend_plane_high(ybf->u_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - - extend_plane_high(ybf->v_buffer, ybf->uv_stride, ybf->uv_crop_width, - ybf->uv_crop_height, uv_border, uv_border, - uv_border + ybf->uv_height - ybf->uv_crop_height, - uv_border + ybf->uv_width - ybf->uv_crop_width); - return; - } -#endif extend_plane(ybf->y_buffer, ybf->y_stride, ybf->y_crop_width, ybf->y_crop_height, ybf->border, ybf->border, ybf->border + ybf->y_height - ybf->y_crop_height, @@ -208,6 +189,7 @@ static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) { // Copies the source image into the destination image and updates the // destination's UMV borders. // Note: The frames are assumed to be identical in size. 
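// (Each plane below is copied row by row: memcpy moves only the visible
// width, while the source and destination pointers advance by their own
// strides, so the two frames may carry different padding as long as their
// visible sizes match.)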
+ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { int row; @@ -222,6 +204,48 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, assert(src_ybc->y_height == dst_ybc->y_height); #endif + for (row = 0; row < src_ybc->y_height; ++row) { + memcpy(dst, src, src_ybc->y_width); + src += src_ybc->y_stride; + dst += dst_ybc->y_stride; + } + + src = src_ybc->u_buffer; + dst = dst_ybc->u_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + src = src_ybc->v_buffer; + dst = dst_ybc->v_buffer; + + for (row = 0; row < src_ybc->uv_height; ++row) { + memcpy(dst, src, src_ybc->uv_width); + src += src_ybc->uv_stride; + dst += dst_ybc->uv_stride; + } + + vp8_yv12_extend_frame_borders_c(dst_ybc); +} + +#if CONFIG_VP9 +void vpx_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { + int row; + const uint8_t *src = src_ybc->y_buffer; + uint8_t *dst = dst_ybc->y_buffer; + +#if 0 + /* These assertions are valid in the codec, but the libvpx-tester uses + * this code slightly differently. + */ + assert(src_ybc->y_width == dst_ybc->y_width); + assert(src_ybc->y_height == dst_ybc->y_height); +#endif + #if CONFIG_VP9_HIGHBITDEPTH if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { assert(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH); @@ -249,7 +273,7 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); return; } else { assert(!(dst_ybc->flags & YV12_FLAG_HIGHBITDEPTH)); @@ -280,8 +304,9 @@ void vp8_yv12_copy_frame_c(const YV12_BUFFER_CONFIG *src_ybc, dst += dst_ybc->uv_stride; } - vp8_yv12_extend_frame_borders_c(dst_ybc); + vpx_extend_frame_borders_c(dst_ybc); } +#endif // CONFIG_VP9 void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc) { diff --git a/libvpx/vpx_scale/vpx_scale_rtcd.pl b/libvpx/vpx_scale/vpx_scale_rtcd.pl index 44b115c7e..1281071a7 100644 --- a/libvpx/vpx_scale/vpx_scale_rtcd.pl +++ b/libvpx/vpx_scale/vpx_scale_rtcd.pl @@ -1,3 +1,13 @@ +## +## Copyright (c) 2017 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + sub vpx_scale_forward_decls() { print <<EOF struct yv12_buffer_config; @@ -23,6 +33,8 @@ add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_yb add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; if (vpx_config("CONFIG_VP9") eq "yes") { + add_proto qw/void vpx_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; + add_proto qw/void vpx_extend_frame_borders/, "struct yv12_buffer_config *ybf"; specialize qw/vpx_extend_frame_borders dspr2/; diff --git a/libvpx/vpx_util/vpx_atomics.h b/libvpx/vpx_util/vpx_atomics.h new file mode 100644 index 000000000..b8cf80dae --- /dev/null +++ b/libvpx/vpx_util/vpx_atomics.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_UTIL_VPX_ATOMICS_H_ +#define VPX_UTIL_VPX_ATOMICS_H_ + +#include "./vpx_config.h" + +#ifdef __cplusplus extern "C" { +#endif // __cplusplus + +#if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD + +// Look for built-in atomic support. We cannot use <stdatomic.h> or <atomic> +// since neither is guaranteed to exist on both C and C++ platforms, and we need +// to back the atomic type with the same type (g++ needs to be able to use +// gcc-built code). g++ 6 doesn't support _Atomic as a keyword and can't use the +// stdatomic.h header. Even if both <stdatomic.h> and <atomic> existed, it's not +// guaranteed that atomic_int is the same type as std::atomic_int. +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60932#c13. +#if !defined(__has_builtin) +#define __has_builtin(x) 0 // Compatibility with non-clang compilers. +#endif // !defined(__has_builtin) + +#if (__has_builtin(__atomic_load_n)) || \ + (defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))) +// For GCC >= 4.7 and Clang versions that support __atomic builtins, use those. +#define VPX_USE_ATOMIC_BUILTINS +#else +// Use platform-specific asm barriers. +#if defined(_MSC_VER) +// TODO(pbos): This assumes that newer versions of MSVC are building with the +// default /volatile:ms (or older, where this is always true). Consider adding +// support for using <atomic> instead of stdatomic.h when building C++11 under +// MSVC. It's unclear what to do for plain C under /volatile:iso (inline asm?); +// there are no explicit Interlocked* functions for only storing or loading +// (presumably because volatile has historically implied that on MSVC). +// +// For earlier versions of MSVC, or under the default /volatile:ms, volatile int +// accesses are acquire/release and require no barrier. +#define vpx_atomic_memory_barrier() \ + do { \ + } while (0) +#else +#if ARCH_X86 || ARCH_X86_64 +// Use a compiler barrier on x86, no runtime penalty. +#define vpx_atomic_memory_barrier() __asm__ __volatile__("" ::: "memory") +#elif ARCH_ARM +#define vpx_atomic_memory_barrier() __asm__ __volatile__("dmb ish" ::: "memory") +#elif ARCH_MIPS +#define vpx_atomic_memory_barrier() __asm__ __volatile__("sync" ::: "memory") +#else +#error Unsupported architecture! +#endif // ARCH_X86 || ARCH_X86_64 +#endif // defined(_MSC_VER) +#endif // atomic builtin availability check + +// These are wrapped in a struct so that they are not easily accessed directly +// on any platform (to discourage programmer errors by setting values directly). +// This primitive MUST be initialized using vpx_atomic_init or VPX_ATOMIC_INIT +// (NOT memset) and accessed through vpx_atomic_ functions. +typedef struct vpx_atomic_int { volatile int value; } vpx_atomic_int; + +#define VPX_ATOMIC_INIT(num) \ + { num } + +// Initialization of an atomic int, not thread safe.
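+//
+// [Editorial sketch: not part of the original patch] The intended usage is a
+// release store in the publishing thread paired with an acquire load in the
+// observing thread, e.g. a hypothetical row-progress counter shared between
+// a worker and a consumer:
+//
+//   vpx_atomic_int rows_done = VPX_ATOMIC_INIT(0);
+//   // worker, after finishing row r:
+//   vpx_atomic_store_release(&rows_done, r + 1);
+//   // consumer, before reading row r:
+//   while (vpx_atomic_load_acquire(&rows_done) <= r) { /* spin or yield */ }
+//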
+static INLINE void vpx_atomic_init(vpx_atomic_int *atomic, int value) { + atomic->value = value; +} + +static INLINE void vpx_atomic_store_release(vpx_atomic_int *atomic, int value) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + __atomic_store_n(&atomic->value, value, __ATOMIC_RELEASE); +#else + vpx_atomic_memory_barrier(); + atomic->value = value; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +static INLINE int vpx_atomic_load_acquire(const vpx_atomic_int *atomic) { +#if defined(VPX_USE_ATOMIC_BUILTINS) + return __atomic_load_n(&atomic->value, __ATOMIC_ACQUIRE); +#else + int v = atomic->value; + vpx_atomic_memory_barrier(); + return v; +#endif // defined(VPX_USE_ATOMIC_BUILTINS) +} + +#undef VPX_USE_ATOMIC_BUILTINS +#undef vpx_atomic_memory_barrier + +#endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // VPX_UTIL_VPX_ATOMICS_H_ diff --git a/libvpx/vpx_util/vpx_util.mk b/libvpx/vpx_util/vpx_util.mk index c0ef8d336..86d3ece3c 100644 --- a/libvpx/vpx_util/vpx_util.mk +++ b/libvpx/vpx_util/vpx_util.mk @@ -8,7 +8,10 @@ ## be found in the AUTHORS file in the root of the source tree. ## +UTIL_SRCS-yes += vpx_atomics.h UTIL_SRCS-yes += vpx_util.mk UTIL_SRCS-yes += vpx_thread.c UTIL_SRCS-yes += vpx_thread.h UTIL_SRCS-yes += endian_inl.h +UTIL_SRCS-yes += vpx_write_yuv_frame.h +UTIL_SRCS-yes += vpx_write_yuv_frame.c diff --git a/libvpx/vpx_util/vpx_write_yuv_frame.c b/libvpx/vpx_util/vpx_write_yuv_frame.c new file mode 100644 index 000000000..ab6855811 --- /dev/null +++ b/libvpx/vpx_util/vpx_write_yuv_frame.c @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_dsp/skin_detection.h" +#include "vpx_util/vpx_write_yuv_frame.h" + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s) { +#if defined(OUTPUT_YUV_SRC) || defined(OUTPUT_YUV_DENOISED) || \ + defined(OUTPUT_YUV_SKINMAP) + + unsigned char *src = s->y_buffer; + int h = s->y_crop_height; + + do { + fwrite(src, s->y_width, 1, yuv_file); + src += s->y_stride; + } while (--h); + + src = s->u_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + + src = s->v_buffer; + h = s->uv_crop_height; + + do { + fwrite(src, s->uv_width, 1, yuv_file); + src += s->uv_stride; + } while (--h); + +#else + (void)yuv_file; + (void)s; +#endif +} diff --git a/libvpx/vpx_util/vpx_write_yuv_frame.h b/libvpx/vpx_util/vpx_write_yuv_frame.h new file mode 100644 index 000000000..1cb702981 --- /dev/null +++ b/libvpx/vpx_util/vpx_write_yuv_frame.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ +#define VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ + +#include <stdio.h> +#include "vpx_scale/yv12config.h" + +#ifdef __cplusplus +extern "C" { +#endif + +void vpx_write_yuv_frame(FILE *yuv_file, YV12_BUFFER_CONFIG *s); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_UTIL_VPX_WRITE_YUV_FRAME_H_ diff --git a/libvpx/vpxdec.c b/libvpx/vpxdec.c index 6db2afb4a..ff20e6a3c 100644 --- a/libvpx/vpxdec.c +++ b/libvpx/vpxdec.c @@ -47,6 +47,8 @@ struct VpxDecInputContext { struct WebmInputContext *webm_ctx; }; +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1, "Number of times to decode the file"); static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1, "Codec to use"); @@ -98,17 +100,17 @@ static const arg_def_t framestatsarg = ARG_DEF(NULL, "framestats", 1, "Output per-frame stats (.csv format)"); static const arg_def_t *all_args[] = { - &codecarg, &use_yv12, &use_i420, - &flipuvarg, &rawvideo, &noblitarg, - &progressarg, &limitarg, &skiparg, - &postprocarg, &summaryarg, &outputfile, - &threadsarg, &frameparallelarg, &verbosearg, - &scalearg, &fb_arg, &md5arg, - &error_concealment, &continuearg, + &help, &codecarg, &use_yv12, + &use_i420, &flipuvarg, &rawvideo, + &noblitarg, &progressarg, &limitarg, + &skiparg, &postprocarg, &summaryarg, + &outputfile, &threadsarg, &frameparallelarg, + &verbosearg, &scalearg, &fb_arg, + &md5arg, &error_concealment, &continuearg, #if CONFIG_VP9_HIGHBITDEPTH &outbitdeptharg, #endif - &svcdecodingarg, &framestatsarg, NULL + &svcdecodingarg, &framestatsarg, NULL }; #if CONFIG_VP8_DECODER @@ -152,41 +154,47 @@ static INLINE int libyuv_scale(vpx_image_t *src, vpx_image_t *dst, dst->d_h, mode); } #endif - -void usage_exit(void) { +void show_help(FILE *fout, int shorthelp) { int i; - fprintf(stderr, - "Usage: %s <options> filename\n\n" - "Options:\n", - exec_name); - arg_show_usage(stderr, all_args); + fprintf(fout, "Usage: %s <options> filename\n\n", exec_name); + + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "Options:\n"); + arg_show_usage(fout, all_args); #if CONFIG_VP8_DECODER - fprintf(stderr, "\nVP8 Postprocessing Options:\n"); - arg_show_usage(stderr, vp8_pp_args); + fprintf(fout, "\nVP8 Postprocessing Options:\n"); + arg_show_usage(fout, vp8_pp_args); #endif - fprintf(stderr, + fprintf(fout, "\nOutput File Patterns:\n\n" " The -o argument specifies the name of the file(s) to " "write to. If the\n argument does not include any escape " "characters, the output will be\n written to a single file. " "Otherwise, the filename will be calculated by\n expanding " "the following escape characters:\n"); - fprintf(stderr, + fprintf(fout, "\n\t%%w - Frame width" "\n\t%%h - Frame height" "\n\t%%<n> - Frame number, zero padded to <n> places (1..9)" "\n\n Pattern arguments are only supported in conjunction " "with the --yv12 and\n --i420 options. 
If the -o option is " "not specified, the output will be\n directed to stdout.\n"); - fprintf(stderr, "\nIncluded decoders:\n\n"); + fprintf(fout, "\nIncluded decoders:\n\n"); for (i = 0; i < get_vpx_decoder_count(); ++i) { const VpxInterface *const decoder = get_vpx_decoder_by_index(i); - fprintf(stderr, " %-6s - %s\n", decoder->name, + fprintf(fout, " %-6s - %s\n", decoder->name, vpx_codec_iface_name(decoder->codec_interface())); } +} +void usage_exit(void) { + show_help(stderr, 1); exit(EXIT_FAILURE); } @@ -554,7 +562,10 @@ static int main_loop(int argc, const char **argv_) { memset(&arg, 0, sizeof(arg)); arg.argv_step = 1; - if (arg_match(&arg, &codecarg, argi)) { + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { interface = get_vpx_decoder_by_name(arg.val); if (!interface) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); @@ -651,6 +662,7 @@ static int main_loop(int argc, const char **argv_) { if (!fn) { free(argv); + fprintf(stderr, "No input file specified!\n"); usage_exit(); } /* Open file */ diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c index 6c887dfeb..4db7eccc3 100644 --- a/libvpx/vpxenc.c +++ b/libvpx/vpxenc.c @@ -123,6 +123,8 @@ static int fourcc_is_ivf(const char detect[4]) { return 0; } +static const arg_def_t help = + ARG_DEF(NULL, "help", 0, "Show usage options and exit"); static const arg_def_t debugmode = ARG_DEF("D", "debug", 0, "Debug mode (makes output deterministic)"); static const arg_def_t outputfile = @@ -199,7 +201,8 @@ static const arg_def_t test16bitinternalarg = ARG_DEF( NULL, "test-16bit-internal", 0, "Force use of 16 bit internal buffer"); #endif -static const arg_def_t *main_args[] = { &debugmode, +static const arg_def_t *main_args[] = { &help, + &debugmode, &outputfile, &codecarg, &passes, @@ -321,8 +324,11 @@ static const arg_def_t minsection_pct = ARG_DEF(NULL, "minsection-pct", 1, "GOP min bitrate (% of target)"); static const arg_def_t maxsection_pct = ARG_DEF(NULL, "maxsection-pct", 1, "GOP max bitrate (% of target)"); -static const arg_def_t *rc_twopass_args[] = { &bias_pct, &minsection_pct, - &maxsection_pct, NULL }; +static const arg_def_t corpus_complexity = + ARG_DEF(NULL, "corpus-complexity", 1, "corpus vbr complexity midpoint"); +static const arg_def_t *rc_twopass_args[] = { + &bias_pct, &minsection_pct, &maxsection_pct, &corpus_complexity, NULL +}; static const arg_def_t kf_min_dist = ARG_DEF(NULL, "kf-min-dist", 1, "Minimum keyframe interval (frames)"); @@ -441,8 +447,8 @@ static const struct arg_enum_list color_space_enum[] = { }; static const arg_def_t input_color_space = - ARG_DEF_ENUM(NULL, "color-space", 1, "The color space of input content:", - color_space_enum); + ARG_DEF_ENUM(NULL, "color-space", 1, + "The color space of input content:", color_space_enum); #if CONFIG_VP9_HIGHBITDEPTH static const struct arg_enum_list bitdepth_enum[] = { @@ -460,6 +466,7 @@ static const arg_def_t inbitdeptharg = static const struct arg_enum_list tune_content_enum[] = { { "default", VP9E_CONTENT_DEFAULT }, { "screen", VP9E_CONTENT_SCREEN }, + { "film", VP9E_CONTENT_FILM }, { NULL, 0 } }; @@ -468,8 +475,14 @@ static const arg_def_t tune_content = ARG_DEF_ENUM( static const arg_def_t target_level = ARG_DEF( NULL, "target-level", 1, - "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;" - " 11: level 1.1; ... 
62: level 6.2)"); + "Target level\n" + " 255: off (default)\n" + " 0: only keep level stats\n" + " 1: adaptively set alt-ref " + "distance and column tile limit based on picture size, and keep" + " level stats\n" + " 10: level 1.0 11: level 1.1 " + "... 62: level 6.2"); static const arg_def_t row_mt = ARG_DEF(NULL, "row-mt", 1, @@ -539,46 +552,54 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, static const arg_def_t *no_args[] = { NULL }; -void usage_exit(void) { +void show_help(FILE *fout, int shorthelp) { int i; const int num_encoder = get_vpx_encoder_count(); - fprintf(stderr, "Usage: %s <options> -o dst_filename src_filename \n", + fprintf(fout, "Usage: %s <options> -o dst_filename src_filename \n", exec_name); - fprintf(stderr, "\nOptions:\n"); - arg_show_usage(stderr, main_args); - fprintf(stderr, "\nEncoder Global Options:\n"); - arg_show_usage(stderr, global_args); - fprintf(stderr, "\nRate Control Options:\n"); - arg_show_usage(stderr, rc_args); - fprintf(stderr, "\nTwopass Rate Control Options:\n"); - arg_show_usage(stderr, rc_twopass_args); - fprintf(stderr, "\nKeyframe Placement Options:\n"); - arg_show_usage(stderr, kf_args); + if (shorthelp) { + fprintf(fout, "Use --help to see the full list of options.\n"); + return; + } + + fprintf(fout, "\nOptions:\n"); + arg_show_usage(fout, main_args); + fprintf(fout, "\nEncoder Global Options:\n"); + arg_show_usage(fout, global_args); + fprintf(fout, "\nRate Control Options:\n"); + arg_show_usage(fout, rc_args); + fprintf(fout, "\nTwopass Rate Control Options:\n"); + arg_show_usage(fout, rc_twopass_args); + fprintf(fout, "\nKeyframe Placement Options:\n"); + arg_show_usage(fout, kf_args); #if CONFIG_VP8_ENCODER - fprintf(stderr, "\nVP8 Specific Options:\n"); - arg_show_usage(stderr, vp8_args); + fprintf(fout, "\nVP8 Specific Options:\n"); + arg_show_usage(fout, vp8_args); #endif #if CONFIG_VP9_ENCODER - fprintf(stderr, "\nVP9 Specific Options:\n"); - arg_show_usage(stderr, vp9_args); + fprintf(fout, "\nVP9 Specific Options:\n"); + arg_show_usage(fout, vp9_args); #endif - fprintf(stderr, + fprintf(fout, "\nStream timebase (--timebase):\n" " The desired precision of timestamps in the output, expressed\n" " in fractional seconds. Default is 1/1000.\n"); - fprintf(stderr, "\nIncluded encoders:\n\n"); + fprintf(fout, "\nIncluded encoders:\n\n"); for (i = 0; i < num_encoder; ++i) { const VpxInterface *const encoder = get_vpx_encoder_by_index(i); const char *defstr = (i == (num_encoder - 1)) ? 
"(default)" : ""; - fprintf(stderr, " %-6s - %s %s\n", encoder->name, + fprintf(fout, " %-6s - %s %s\n", encoder->name, vpx_codec_iface_name(encoder->codec_interface()), defstr); } - fprintf(stderr, "\n "); - fprintf(stderr, "Use --codec to switch to a non-default encoder.\n\n"); + fprintf(fout, "\n "); + fprintf(fout, "Use --codec to switch to a non-default encoder.\n\n"); +} +void usage_exit(void) { + show_help(stderr, 1); exit(EXIT_FAILURE); } @@ -893,7 +914,10 @@ static void parse_global_config(struct VpxEncoderConfig *global, char **argv) { for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) { arg.argv_step = 1; - if (arg_match(&arg, &codecarg, argi)) { + if (arg_match(&arg, &help, argi)) { + show_help(stdout, 0); + exit(EXIT_SUCCESS); + } else if (arg_match(&arg, &codecarg, argi)) { global->codec = get_vpx_encoder_by_name(arg.val); if (!global->codec) die("Error: Unrecognized argument (%s) to --codec\n", arg.val); @@ -1229,6 +1253,11 @@ static int parse_stream_params(struct VpxEncoderConfig *global, if (global->passes < 2) warn("option %s ignored in one-pass mode.\n", arg.name); + } else if (arg_match(&arg, &corpus_complexity, argi)) { + config->cfg.rc_2pass_vbr_corpus_complexity = arg_parse_uint(&arg); + + if (global->passes < 2) + warn("option %s ignored in one-pass mode.\n", arg.name); } else if (arg_match(&arg, &kf_min_dist, argi)) { config->cfg.kf_min_dist = arg_parse_uint(&arg); } else if (arg_match(&arg, &kf_max_dist, argi)) { @@ -1425,6 +1454,7 @@ static void show_stream_config(struct stream_state *stream, SHOW(rc_2pass_vbr_bias_pct); SHOW(rc_2pass_vbr_minsection_pct); SHOW(rc_2pass_vbr_maxsection_pct); + SHOW(rc_2pass_vbr_corpus_complexity); SHOW(kf_mode); SHOW(kf_min_dist); SHOW(kf_max_dist); @@ -1889,8 +1919,6 @@ int main(int argc, const char **argv_) { memset(&input, 0, sizeof(input)); exec_name = argv_[0]; - if (argc < 3) usage_exit(); - /* Setup default input stream settings */ input.framerate.numerator = 30; input.framerate.denominator = 1; @@ -1904,6 +1932,8 @@ int main(int argc, const char **argv_) { argv = argv_dup(argc - 1, argv_ + 1); parse_global_config(&global, argv); + if (argc < 3) usage_exit(); + switch (global.color_type) { case I420: input.fmt = VPX_IMG_FMT_I420; break; case I422: input.fmt = VPX_IMG_FMT_I422; break; @@ -1937,7 +1967,10 @@ int main(int argc, const char **argv_) { /* Handle non-option arguments */ input.filename = argv[0]; - if (!input.filename) usage_exit(); + if (!input.filename) { + fprintf(stderr, "No input file specified!\n"); + usage_exit(); + } /* Decide if other chroma subsamplings than 4:2:0 are supported */ if (global.codec->fourcc == VP9_FOURCC) input.only_i420 = 0; diff --git a/libvpx/y4minput.c b/libvpx/y4minput.c index acf7d69fe..1de636cc0 100644 --- a/libvpx/y4minput.c +++ b/libvpx/y4minput.c @@ -195,26 +195,29 @@ static void y4m_42xmpeg2_42xjpeg_helper(unsigned char *_dst, window.*/ for (x = 0; x < OC_MINI(_c_w, 2); x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + - 35 * _src[OC_MINI(x + 1, _c_w - 1)] - - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + - _src[OC_MINI(x + 3, _c_w - 1)] + 64) >> - 7, + 0, + (4 * _src[0] - 17 * _src[OC_MAXI(x - 1, 0)] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[OC_MINI(x + 3, _c_w - 1)] + + 64) >> + 7, 255); } for (; x < _c_w - 3; x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + - 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x 
+ 3] + 64) >> - 7, + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[x + 1] - 9 * _src[x + 2] + _src[x + 3] + 64) >> + 7, 255); } for (; x < _c_w; x++) { _dst[x] = (unsigned char)OC_CLAMPI( - 0, (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + - 35 * _src[OC_MINI(x + 1, _c_w - 1)] - - 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> - 7, + 0, + (4 * _src[x - 2] - 17 * _src[x - 1] + 114 * _src[x] + + 35 * _src[OC_MINI(x + 1, _c_w - 1)] - + 9 * _src[OC_MINI(x + 2, _c_w - 1)] + _src[_c_w - 1] + 64) >> + 7, 255); } _dst += _c_w; @@ -314,28 +317,31 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, for (x = 0; x < c_w; x++) { for (y = 0; y < OC_MINI(c_h, 3); y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + - 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + - 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> - 7, + 0, + (tmp[0] - 9 * tmp[OC_MAXI(y - 2, 0) * c_w] + + 35 * tmp[OC_MAXI(y - 1, 0) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[OC_MINI(y + 2, c_h - 1) * c_w] + 64) >> + 7, 255); } for (; y < c_h - 2; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + - 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> - 7, + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[(y + 1) * c_w] + 4 * tmp[(y + 2) * c_w] + 64) >> + 7, 255); } for (; y < c_h; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + - 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - - 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + - 4 * tmp[(c_h - 1) * c_w] + 64) >> - 7, + 0, + (tmp[(y - 3) * c_w] - 9 * tmp[(y - 2) * c_w] + + 35 * tmp[(y - 1) * c_w] + 114 * tmp[y * c_w] - + 17 * tmp[OC_MINI(y + 1, c_h - 1) * c_w] + + 4 * tmp[(c_h - 1) * c_w] + 64) >> + 7, 255); } _dst++; @@ -361,10 +367,11 @@ static void y4m_convert_42xpaldv_42xjpeg(y4m_input *_y4m, unsigned char *_dst, } for (; y < c_h - 3; y++) { _dst[y * c_w] = (unsigned char)OC_CLAMPI( - 0, (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + - 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - - 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> - 7, + 0, + (4 * tmp[(y - 2) * c_w] - 17 * tmp[(y - 1) * c_w] + + 114 * tmp[y * c_w] + 35 * tmp[(y + 1) * c_w] - + 9 * tmp[(y + 2) * c_w] + tmp[(y + 3) * c_w] + 64) >> + 7, 255); } for (; y < c_h; y++) { @@ -404,18 +411,20 @@ static void y4m_422jpeg_420jpeg_helper(unsigned char *_dst, for (x = 0; x < _c_w; x++) { for (y = 0; y < OC_MINI(_c_h, 2); y += 2) { _dst[(y >> 1) * _c_w] = - OC_CLAMPI(0, (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - - 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + - 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> - 7, + OC_CLAMPI(0, + (64 * _src[0] + 78 * _src[OC_MINI(1, _c_h - 1) * _c_w] - + 17 * _src[OC_MINI(2, _c_h - 1) * _c_w] + + 3 * _src[OC_MINI(3, _c_h - 1) * _c_w] + 64) >> + 7, 255); } for (; y < _c_h - 3; y += 2) { _dst[(y >> 1) * _c_w] = - OC_CLAMPI(0, (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - - 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + - 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) >> - 7, + OC_CLAMPI(0, + (3 * (_src[(y - 2) * _c_w] + _src[(y + 3) * _c_w]) - + 17 * (_src[(y - 1) * _c_w] + _src[(y + 2) * _c_w]) + + 78 * (_src[y * _c_w] + _src[(y + 1) * _c_w]) + 64) 
>> + 7, 255); } for (; y < _c_h; y += 2) { @@ -642,33 +651,38 @@ static void y4m_convert_411_420jpeg(y4m_input *_y4m, unsigned char *_dst, 4-tap Mitchell window.*/ for (x = 0; x < OC_MINI(c_w, 1); x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI( - 0, (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - - _aux[OC_MINI(2, c_w - 1)] + 64) >> - 7, + 0, + (111 * _aux[0] + 18 * _aux[OC_MINI(1, c_w - 1)] - + _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, 255); tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( - 0, (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - - 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> - 7, + 0, + (47 * _aux[0] + 86 * _aux[OC_MINI(1, c_w - 1)] - + 5 * _aux[OC_MINI(2, c_w - 1)] + 64) >> + 7, 255); } for (; x < c_w - 2; x++) { tmp[x << 1] = - (unsigned char)OC_CLAMPI(0, (_aux[x - 1] + 110 * _aux[x] + - 18 * _aux[x + 1] - _aux[x + 2] + 64) >> - 7, + (unsigned char)OC_CLAMPI(0, + (_aux[x - 1] + 110 * _aux[x] + + 18 * _aux[x + 1] - _aux[x + 2] + 64) >> + 7, 255); tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( - 0, (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - - 5 * _aux[x + 2] + 64) >> - 7, + 0, + (-3 * _aux[x - 1] + 50 * _aux[x] + 86 * _aux[x + 1] - + 5 * _aux[x + 2] + 64) >> + 7, 255); } for (; x < c_w; x++) { tmp[x << 1] = (unsigned char)OC_CLAMPI( - 0, (_aux[x - 1] + 110 * _aux[x] + - 18 * _aux[OC_MINI(x + 1, c_w - 1)] - _aux[c_w - 1] + 64) >> - 7, + 0, + (_aux[x - 1] + 110 * _aux[x] + 18 * _aux[OC_MINI(x + 1, c_w - 1)] - + _aux[c_w - 1] + 64) >> + 7, 255); if ((x << 1 | 1) < dst_c_w) { tmp[x << 1 | 1] = (unsigned char)OC_CLAMPI( @@ -718,27 +732,29 @@ static void y4m_convert_444_420jpeg(y4m_input *_y4m, unsigned char *_dst, /*Filter: [3 -17 78 78 -17 3]/128, derived from a 6-tap Lanczos window.*/ for (y = 0; y < c_h; y++) { for (x = 0; x < OC_MINI(c_w, 2); x += 2) { - tmp[x >> 1] = - OC_CLAMPI(0, (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - - 17 * _aux[OC_MINI(2, c_w - 1)] + - 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> - 7, - 255); + tmp[x >> 1] = OC_CLAMPI(0, + (64 * _aux[0] + 78 * _aux[OC_MINI(1, c_w - 1)] - + 17 * _aux[OC_MINI(2, c_w - 1)] + + 3 * _aux[OC_MINI(3, c_w - 1)] + 64) >> + 7, + 255); } for (; x < c_w - 3; x += 2) { - tmp[x >> 1] = OC_CLAMPI(0, (3 * (_aux[x - 2] + _aux[x + 3]) - - 17 * (_aux[x - 1] + _aux[x + 2]) + - 78 * (_aux[x] + _aux[x + 1]) + 64) >> - 7, + tmp[x >> 1] = OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[x + 3]) - + 17 * (_aux[x - 1] + _aux[x + 2]) + + 78 * (_aux[x] + _aux[x + 1]) + 64) >> + 7, 255); } for (; x < c_w; x += 2) { - tmp[x >> 1] = OC_CLAMPI( - 0, (3 * (_aux[x - 2] + _aux[c_w - 1]) - - 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + - 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> - 7, - 255); + tmp[x >> 1] = + OC_CLAMPI(0, + (3 * (_aux[x - 2] + _aux[c_w - 1]) - + 17 * (_aux[x - 1] + _aux[OC_MINI(x + 2, c_w - 1)]) + + 78 * (_aux[x] + _aux[OC_MINI(x + 1, c_w - 1)]) + 64) >> + 7, + 255); } tmp += dst_c_w; _aux += c_w; diff --git a/libwebm/Android.bp b/libwebm/Android.bp index a1d335ebc..753746e84 100644 --- a/libwebm/Android.bp +++ b/libwebm/Android.bp @@ -1,6 +1,7 @@ cc_library_static { name: "libwebm", srcs: ["mkvparser/mkvparser.cc"], + cflags: ["-Wall", "-Werror"], export_include_dirs: ["."], sanitize: { cfi: true, |
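[Editorial note: not part of the diff] The y4minput.c hunks above are purely mechanical re-indentation; the operands of each OC_CLAMPI expression move between lines, but the arithmetic is unchanged. For reference, each converter applies a clamped fixed-point kernel whose taps sum to 128, such as the [3 -17 78 78 -17 3]/128 filter derived from a 6-tap Lanczos window; a sketch of one output sample, with a hypothetical helper name:

    #include <stdint.h>

    /* Hypothetical helper: one sample of the 6-tap filter. Adding 64 rounds
     * before the >> 7 divides by 128; the result is clamped to [0, 255]. */
    static uint8_t lanczos6(const uint8_t s[6]) {
      const int v = (3 * (s[0] + s[5]) - 17 * (s[1] + s[4]) +
                     78 * (s[2] + s[3]) + 64) >> 7;
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }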