367 files changed, 28279 insertions, 11556 deletions
@@ -20,6 +20,7 @@ Hui Su <huisu@google.com> Jacky Chen <jackychen@google.com> Jim Bankoski <jimbankoski@google.com> Johann Koenig <johannkoenig@google.com> +Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com> Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com> Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org> Johann <johann@duck.com> <johann.koenig@gmail.com> @@ -53,4 +54,4 @@ Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com> Yaowu Xu <yaowu@google.com> <Yaowu Xu> Venkatarama NG. Avadhani <venkatarama.avadhani@ittiam.com> Vitaly Buka <vitalybuka@chromium.org> <vitlaybuka@chromium.org> -xiwei gu <guxiwei-hf@loongson.cn> +Xiwei Gu <guxiwei-hf@loongson.cn> @@ -25,6 +25,7 @@ Andrew Salkeld <andrew.salkeld@arm.com> Angie Chen <yunqi@google.com> Angie Chiang <angiebird@google.com> Anton Venema <anton.venema@liveswitch.com> +Anupam Pandey <anupam.pandey@ittiam.com> Aron Rosenberg <arosenberg@logitech.com> Attila Nagy <attilanagy@google.com> Birk Magnussen <birk.magnussen@googlemail.com> @@ -34,6 +35,8 @@ Brion Vibber <bvibber@wikimedia.org> changjun.yang <changjun.yang@intel.com> Charles 'Buck' Krasic <ckrasic@google.com> Cheng Chen <chengchen@google.com> +Chen Wang <wangchen20@iscas.ac.cn> +Cherma Rajan A <cherma.rajan@ittiam.com> Chi Yo Tsai <chiyotsai@google.com> chm <chm@rock-chips.com> Chris Cunningham <chcunningham@chromium.org> @@ -60,6 +63,8 @@ Fritz Koenig <frkoenig@google.com> Fyodor Kyslov <kyslov@google.com> Gabriel Marin <gmx@chromium.org> Gaute Strokkenes <gaute.strokkenes@broadcom.com> +George Steed <george.steed@arm.com> +Gerda Zsejke More <gerdazsejke.more@arm.com> Geza Lore <gezalore@gmail.com> Ghislain MARY <ghislainmary2@gmail.com> Giuseppe Scrivano <gscrivano@gnu.org> @@ -103,6 +108,7 @@ Jin Bo <jinbo@loongson.cn> Jingning Han <jingning@google.com> Joel Fernandes <joelaf@google.com> Joey Parrish <joeyparrish@google.com> +Johann <johann@duck.com> Johann Koenig <johannkoenig@google.com> John Koleszar <jkoleszar@google.com> Johnny Klonaris <google@jawknee.com> @@ -120,6 +126,7 @@ KO Myung-Hun <komh@chollian.net> Konstantinos Margaritis <konma@vectorcamp.gr> Kyle Siefring <kylesiefring@gmail.com> Lawrence Velázquez <larryv@macports.org> +L. E. Segovia <amy@amyspark.me> Linfeng Zhang <linfengz@google.com> Liu Peng <pengliu.mail@gmail.com> Lou Quillio <louquillio@google.com> @@ -147,6 +154,7 @@ Mirko Bonadei <mbonadei@google.com> Moriyoshi Koizumi <mozo@mozo.jp> Morton Jonuschat <yabawock@gmail.com> Nathan E. 
Egge <negge@mozilla.com> +Neeraj Gadgil <neeraj.gadgil@ittiam.com> Neil Birkbeck <neil.birkbeck@gmail.com> Nico Weber <thakis@chromium.org> Niveditha Rau <niveditha.rau@gmail.com> @@ -213,7 +221,8 @@ Vitaly Buka <vitalybuka@chromium.org> Vlad Tsyrklevich <vtsyrklevich@chromium.org> Wan-Teh Chang <wtc@google.com> Wonkap Jang <wonkap@google.com> -xiwei gu <guxiwei-hf@loongson.cn> +Xiahong Bao <xiahong.bao@nxp.com> +Xiwei Gu <guxiwei-hf@loongson.cn> Yaowu Xu <yaowu@google.com> Yi Luo <luoyi@google.com> Yongzhe Wang <yongzhe@google.com> diff --git a/Android.bp b/Android.bp index 952765b90..b770ff9a0 100644 --- a/Android.bp +++ b/Android.bp @@ -111,7 +111,9 @@ libvpx_arm_neon_c_srcs = [ "vp9/decoder/vp9_job_queue.c", "vp9/encoder/arm/neon/vp9_dct_neon.c", "vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c", + "vp9/encoder/arm/neon/vp9_error_neon.c", "vp9/encoder/arm/neon/vp9_frame_scale_neon.c", + "vp9/encoder/arm/neon/vp9_highbd_error_neon.c", "vp9/encoder/arm/neon/vp9_quantize_neon.c", "vp9/encoder/vp9_aq_cyclicrefresh.c", "vp9/encoder/vp9_bitstream.c", @@ -143,6 +145,7 @@ libvpx_arm_neon_c_srcs = [ "vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/vp9_cx_iface.c", "vp9/vp9_dx_iface.c", @@ -151,6 +154,7 @@ libvpx_arm_neon_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/arm/avg_neon.c", "vpx_dsp/arm/avg_pred_neon.c", "vpx_dsp/arm/fdct4x4_neon.c", @@ -159,6 +163,9 @@ libvpx_arm_neon_c_srcs = [ "vpx_dsp/arm/fdct32x32_neon.c", "vpx_dsp/arm/fdct_partial_neon.c", "vpx_dsp/arm/hadamard_neon.c", + "vpx_dsp/arm/highbd_avg_neon.c", + "vpx_dsp/arm/highbd_avg_pred_neon.c", + "vpx_dsp/arm/highbd_hadamard_neon.c", "vpx_dsp/arm/highbd_idct4x4_add_neon.c", "vpx_dsp/arm/highbd_idct8x8_add_neon.c", "vpx_dsp/arm/highbd_idct16x16_add_neon.c", @@ -169,7 +176,10 @@ libvpx_arm_neon_c_srcs = [ "vpx_dsp/arm/highbd_intrapred_neon.c", "vpx_dsp/arm/highbd_loopfilter_neon.c", "vpx_dsp/arm/highbd_quantize_neon.c", + "vpx_dsp/arm/highbd_sad4d_neon.c", "vpx_dsp/arm/highbd_sad_neon.c", + "vpx_dsp/arm/highbd_sse_neon.c", + "vpx_dsp/arm/highbd_subpel_variance_neon.c", "vpx_dsp/arm/highbd_variance_neon.c", "vpx_dsp/arm/highbd_vpx_convolve8_neon.c", "vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", @@ -187,6 +197,7 @@ libvpx_arm_neon_c_srcs = [ "vpx_dsp/arm/quantize_neon.c", "vpx_dsp/arm/sad4d_neon.c", "vpx_dsp/arm/sad_neon.c", + "vpx_dsp/arm/sse_neon.c", "vpx_dsp/arm/subpel_variance_neon.c", "vpx_dsp/arm/subtract_neon.c", "vpx_dsp/arm/sum_squares_neon.c", @@ -208,13 +219,14 @@ libvpx_arm_neon_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", "vpx_dsp/vpx_convolve.c", "vpx_dsp/vpx_dsp_rtcd.c", "vpx_mem/vpx_mem.c", - "vpx_ports/arm_cpudetect.c", + "vpx_ports/aarch32_cpudetect.c", "vpx_scale/generic/gen_scalers.c", "vpx_scale/generic/vpx_scale.c", "vpx_scale/generic/yv12config.c", @@ -355,7 +367,9 @@ libvpx_arm64_c_srcs = [ "vp9/decoder/vp9_job_queue.c", "vp9/encoder/arm/neon/vp9_dct_neon.c", "vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c", + "vp9/encoder/arm/neon/vp9_error_neon.c", "vp9/encoder/arm/neon/vp9_frame_scale_neon.c", + "vp9/encoder/arm/neon/vp9_highbd_error_neon.c", "vp9/encoder/arm/neon/vp9_quantize_neon.c", "vp9/encoder/vp9_aq_cyclicrefresh.c", "vp9/encoder/vp9_bitstream.c", @@ -387,6 +401,7 @@ libvpx_arm64_c_srcs = [ 
"vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/vp9_cx_iface.c", "vp9/vp9_dx_iface.c", @@ -395,6 +410,7 @@ libvpx_arm64_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/arm/avg_neon.c", "vpx_dsp/arm/avg_pred_neon.c", "vpx_dsp/arm/fdct4x4_neon.c", @@ -403,6 +419,9 @@ libvpx_arm64_c_srcs = [ "vpx_dsp/arm/fdct32x32_neon.c", "vpx_dsp/arm/fdct_partial_neon.c", "vpx_dsp/arm/hadamard_neon.c", + "vpx_dsp/arm/highbd_avg_neon.c", + "vpx_dsp/arm/highbd_avg_pred_neon.c", + "vpx_dsp/arm/highbd_hadamard_neon.c", "vpx_dsp/arm/highbd_idct4x4_add_neon.c", "vpx_dsp/arm/highbd_idct8x8_add_neon.c", "vpx_dsp/arm/highbd_idct16x16_add_neon.c", @@ -413,7 +432,10 @@ libvpx_arm64_c_srcs = [ "vpx_dsp/arm/highbd_intrapred_neon.c", "vpx_dsp/arm/highbd_loopfilter_neon.c", "vpx_dsp/arm/highbd_quantize_neon.c", + "vpx_dsp/arm/highbd_sad4d_neon.c", "vpx_dsp/arm/highbd_sad_neon.c", + "vpx_dsp/arm/highbd_sse_neon.c", + "vpx_dsp/arm/highbd_subpel_variance_neon.c", "vpx_dsp/arm/highbd_variance_neon.c", "vpx_dsp/arm/highbd_vpx_convolve8_neon.c", "vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c", @@ -434,6 +456,7 @@ libvpx_arm64_c_srcs = [ "vpx_dsp/arm/quantize_neon.c", "vpx_dsp/arm/sad4d_neon.c", "vpx_dsp/arm/sad_neon.c", + "vpx_dsp/arm/sse_neon.c", "vpx_dsp/arm/subpel_variance_neon.c", "vpx_dsp/arm/subtract_neon.c", "vpx_dsp/arm/sum_squares_neon.c", @@ -457,13 +480,14 @@ libvpx_arm64_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", "vpx_dsp/vpx_convolve.c", "vpx_dsp/vpx_dsp_rtcd.c", "vpx_mem/vpx_mem.c", - "vpx_ports/arm_cpudetect.c", + "vpx_ports/aarch64_cpudetect.c", "vpx_scale/generic/gen_scalers.c", "vpx_scale/generic/vpx_scale.c", "vpx_scale/generic/yv12config.c", @@ -587,6 +611,7 @@ libvpx_generic_c_srcs = [ "vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/vp9_cx_iface.c", "vp9/vp9_dx_iface.c", @@ -595,6 +620,7 @@ libvpx_generic_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/avg.c", "vpx_dsp/bitreader.c", "vpx_dsp/bitreader_buffer.c", @@ -609,6 +635,7 @@ libvpx_generic_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", @@ -750,6 +777,7 @@ libvpx_x86_c_srcs = [ "vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/encoder/x86/vp9_dct_intrin_sse2.c", "vp9/encoder/x86/vp9_frame_scale_ssse3.c", @@ -763,6 +791,7 @@ libvpx_x86_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/add_noise.c", "vpx_dsp/avg.c", "vpx_dsp/bitreader.c", @@ -779,6 +808,7 @@ libvpx_x86_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", @@ -986,6 +1016,7 @@ libvpx_x86_64_c_srcs = [ "vp9/encoder/vp9_subexp.c", "vp9/encoder/vp9_svc_layercontext.c", "vp9/encoder/vp9_tokenize.c", + "vp9/encoder/vp9_tpl_model.c", "vp9/encoder/vp9_treewriter.c", "vp9/encoder/x86/vp9_dct_intrin_sse2.c", 
"vp9/encoder/x86/vp9_frame_scale_ssse3.c", @@ -999,6 +1030,7 @@ libvpx_x86_64_c_srcs = [ "vpx/src/vpx_decoder.c", "vpx/src/vpx_encoder.c", "vpx/src/vpx_image.c", + "vpx/src/vpx_tpl.c", "vpx_dsp/add_noise.c", "vpx_dsp/avg.c", "vpx_dsp/bitreader.c", @@ -1015,6 +1047,7 @@ libvpx_x86_64_c_srcs = [ "vpx_dsp/quantize.c", "vpx_dsp/sad.c", "vpx_dsp/skin_detection.c", + "vpx_dsp/sse.c", "vpx_dsp/subtract.c", "vpx_dsp/sum_squares.c", "vpx_dsp/variance.c", @@ -1,3 +1,80 @@ +2024-01-02 v1.14.0 "Venetian Duck" + This release drops support for old C compilers, such as Visual Studio 2012 + and older, that disallow mixing variable declarations and statements (a C99 + feature). It adds support for run-time CPU feature detection for Arm + platforms, as well as support for darwin23 (macOS 14). + + - Upgrading: + This release is ABI incompatible with the previous release. + + Various new features for rate control library for real-time: SVC parallel + encoding, loopfilter level, support for frame dropping, and screen content. + + New callback function send_tpl_gop_stats for vp9 external rate control + library, which can be used to transmit TPL stats for a group of pictures. A + public header vpx_tpl.h is added for the definition of TPL stats used in + this callback. + + libwebm is upgraded to libwebm-1.0.0.29-9-g1930e3c. + + - Enhancement: + Improvements on Neon optimizations: VoD: 12-35% speed up for bitdepth 8, + 68%-151% speed up for high bitdepth. + + Improvements on AVX2 and SSE optimizations. + Improvements on LSX optimizations for LoongArch. + 42-49% speedup on speed 0 VoD encoding. + Android API level predicates. + + - Bug fixes: + Fix to missing prototypes from the rtcd header. + Fix to segfault when total size is enlarged but width is smaller. + Fix to the build for arm64ec using MSVC. + Fix to copy BLOCK_8X8's mi to PICK_MODE_CONTEXT::mic. + Fix to -Wshadow warnings. + Fix to heap overflow in vpx_get4x4sse_cs_neon. + Fix to buffer overrun in highbd Neon subpel variance filters. + Added bitexact encode test script. + Fix to -Wl,-z,defs with Clang's sanitizers. + Fix to decoder stability after error & continued decoding. + Fix to mismatch of VP9 encode with NEON intrinsics with C only version. + Fix to Arm64 MSVC compile vpx_highbd_fdct4x4_neon. + Fix to fragments count before use. + Fix to a case where target bandwidth is 0 for SVC. + Fix mask in vp9_quantize_avx2,highbd_get_max_lane_eob. + Fix to int overflow in vp9_calc_pframe_target_size_one_pass_cbr. + Fix to integer overflow in vp8,ratectrl.c. + Fix to integer overflow in vp9 svc. + Fix to avg_frame_bandwidth overflow. + Fix to per frame qp for temporal layers. + Fix to unsigned integer overflow in sse computation. + Fix to uninitialized mesh feature for BEST mode. + Fix to overflow in highbd temporal_filter. + Fix to unaligned loads w/w==4 in vpx_convolve_copy_neon. + Skip arm64_neon.h workaround w/VS >= 2019. + Fix to c vs avx mismatch of diamond_search_sad(). + Fix to c vs intrinsic mismatch of vpx_hadamard_32x32() function. + Fix to a bug in vpx_hadamard_32x32_neon(). + Fix to Clang -Wunreachable-code-aggressive warnings. + Fix to a bug in vpx_highbd_hadamard_32x32_neon(). + Fix to -Wunreachable-code in mfqe_partition. + Force mode search on 64x64 if no mode is selected. + Fix to ubsan failure caused by left shift of negative. + Fix to integer overflow in calc_pframe_target_size. + Fix to float-cast-overflow in vp8_change_config(). + Fix to a null ptr before use. + Conditionally skip using inter frames in speed features. 
+ Remove invalid reference frames. + Disable intra mode search speed features conditionally. + Set nonrd keyframe under dynamic change of deadline for rtc. + Fix to scaled reference offsets. + Set skip_recode=0 in nonrd_pick_sb_modes. + Fix to an edge case when downsizing to one. + Fix to a bug in frame scaling. + Fix to pred buffer stride. + Fix to a bug in simple motion search. + Update frame size in actual encoding. + 2023-09-29 v1.13.1 "Ugly Duckling" This release contains two security related fixes. One each for VP8 and VP9. @@ -64,9 +64,16 @@ COMPILING THE APPLICATIONS/LIBRARIES: arm64-android-gcc arm64-darwin-gcc arm64-darwin20-gcc + arm64-darwin21-gcc + arm64-darwin22-gcc + arm64-darwin23-gcc arm64-linux-gcc arm64-win64-gcc arm64-win64-vs15 + arm64-win64-vs16 + arm64-win64-vs16-clangcl + arm64-win64-vs17 + arm64-win64-vs17-clangcl armv7-android-gcc armv7-darwin-gcc armv7-linux-rvct @@ -75,8 +82,12 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-gcc armv7-win32-vs14 armv7-win32-vs15 + armv7-win32-vs16 + armv7-win32-vs17 armv7s-darwin-gcc armv8-linux-gcc + loongarch32-linux-gcc + loongarch64-linux-gcc mips32-linux-gcc mips64-linux-gcc ppc64le-linux-gcc @@ -117,6 +128,9 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin18-gcc x86_64-darwin19-gcc x86_64-darwin20-gcc + x86_64-darwin21-gcc + x86_64-darwin22-gcc + x86_64-darwin23-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc @@ -135,7 +135,6 @@ unsigned int arg_parse_uint(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } int arg_parse_int(const struct arg *arg) { @@ -152,7 +151,6 @@ int arg_parse_int(const struct arg *arg) { } die("Option %s: Invalid character '%c'\n", arg->name, *endptr); - return 0; } struct vpx_rational { @@ -209,7 +207,6 @@ int arg_parse_enum(const struct arg *arg) { if (!strcmp(arg->val, listptr->name)) return listptr->val; die("Option %s: Invalid value '%s'\n", arg->name, arg->val); - return 0; } int arg_parse_enum_or_int(const struct arg *arg) { diff --git a/build/make/Makefile b/build/make/Makefile index 5c38c18e5..199ed7805 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -143,6 +143,14 @@ $(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(BUILD_PFX)%_avx512.c.d: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl $(BUILD_PFX)%_avx512.c.o: CFLAGS += -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl +# AARCH64 +$(BUILD_PFX)%_neon_dotprod.c.d: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_dotprod.c.o: CFLAGS += -march=armv8.2-a+dotprod +$(BUILD_PFX)%_neon_i8mm.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_neon_i8mm.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm +$(BUILD_PFX)%_sve.c.d: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve +$(BUILD_PFX)%_sve.c.o: CFLAGS += -march=armv8.2-a+dotprod+i8mm+sve + # POWER $(BUILD_PFX)%_vsx.c.d: CFLAGS += -maltivec -mvsx $(BUILD_PFX)%_vsx.c.o: CFLAGS += -maltivec -mvsx @@ -304,6 +312,19 @@ $(1): $(qexec)$$(AR) $$(ARFLAGS) $$@ $$^ endef +# Don't use -Wl,-z,defs with Clang's sanitizers. +# +# Clang's AddressSanitizer documentation says "When linking shared libraries, +# the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link +# errors (don't use it with AddressSanitizer)." See +# https://clang.llvm.org/docs/AddressSanitizer.html#usage. 
+NO_UNDEFINED := -Wl,-z,defs
+ifeq ($(findstring clang,$(CC)),clang)
+  ifneq ($(filter -fsanitize=%,$(LDFLAGS)),)
+    NO_UNDEFINED :=
+  endif
+endif
+
 define so_template
 # Not using a pattern rule here because we don't want to generate empty
 # archives when they are listed as a dependency in files not responsible
@@ -313,7 +334,8 @@ define so_template
 $(1):
	$(if $(quiet),@echo " [LD] $$@")
	$(qexec)$$(LD) -shared $$(LDFLAGS) \
-            -Wl,--no-undefined -Wl,-soname,$$(SONAME) \
+            $(NO_UNDEFINED) \
+            -Wl,-soname,$$(SONAME) \
             -Wl,--version-script,$$(EXPORTS_FILE) -o $$@ \
             $$(filter %.o,$$^) $$(extralibs)
 endef
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 4bf090f00..b645a666f 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -521,6 +521,7 @@ AS_SFX = ${AS_SFX:-.asm}
 EXE_SFX = ${EXE_SFX}
 VCPROJ_SFX = ${VCPROJ_SFX}
 RTCD_OPTIONS = ${RTCD_OPTIONS}
+LIBWEBM_CXXFLAGS = ${LIBWEBM_CXXFLAGS}
 LIBYUV_CXXFLAGS = ${LIBYUV_CXXFLAGS}
 EOF
 
@@ -791,7 +792,7 @@ process_common_toolchain() {
       tgt_isa=x86_64
       tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'`
       ;;
-    *darwin2[0-2]*)
+    *darwin2[0-3]*)
       tgt_isa=`uname -m`
       tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'`
       ;;
@@ -842,6 +843,10 @@ process_common_toolchain() {
   # Enable the architecture family
   case ${tgt_isa} in
+    arm64 | armv8)
+      enable_feature arm
+      enable_feature aarch64
+      ;;
     arm*)
       enable_feature arm
       ;;
@@ -858,8 +863,14 @@ process_common_toolchain() {
       ;;
   esac
 
-  # PIC is probably what we want when building shared libs
+  # Position independent code (PIC) is probably what we want when building
+  # shared libs or position independent executable (PIE) targets.
   enabled shared && soft_enable pic
+  check_cpp << EOF || soft_enable pic
+#if !(__pie__ || __PIE__)
+#error Neither __pie__ or __PIE__ are set
+#endif
+EOF
 
   # Minimum iOS version for all target platforms (darwin and iphonesimulator).
   # Shared library framework builds are only possible on iOS 8 and later.
@@ -940,7 +951,7 @@ process_common_toolchain() {
       add_cflags "-mmacosx-version-min=10.15"
       add_ldflags "-mmacosx-version-min=10.15"
       ;;
-    *-darwin2[0-2]-*)
+    *-darwin2[0-3]-*)
       add_cflags "-arch ${toolchain%%-*}"
       add_ldflags "-arch ${toolchain%%-*}"
       ;;
@@ -965,13 +976,26 @@ process_common_toolchain() {
       ;;
   esac
 
-  # Process ARM architecture variants
+  # Process architecture variants
   case ${toolchain} in
     arm*)
-      # on arm, isa versions are supersets
+      soft_enable runtime_cpu_detect
+      # Arm ISA extensions are treated as supersets.
      case ${tgt_isa} in
         arm64|armv8)
-          soft_enable neon
+          for ext in ${ARCH_EXT_LIST_AARCH64}; do
+            # Disable higher order extensions to simplify dependencies.
+            if [ "$disable_exts" = "yes" ]; then
+              if ! disabled $ext; then
+                RTCD_OPTIONS="${RTCD_OPTIONS}--disable-${ext} "
+                disable_feature $ext
+              fi
+            elif disabled $ext; then
+              disable_exts="yes"
+            else
+              soft_enable $ext
+            fi
+          done
           ;;
         armv7|armv7s)
           soft_enable neon
@@ -1066,8 +1090,11 @@ EOF
             enable_feature win_arm64_neon_h_workaround
           else
             # If a probe is not possible, assume this is the pure Windows
-            # SDK and so the workaround is necessary.
-            enable_feature win_arm64_neon_h_workaround
+            # SDK and so the workaround is necessary when using Visual
+            # Studio < 2019.
+            if [ ${tgt_cc##vs} -lt 16 ]; then
+              enable_feature win_arm64_neon_h_workaround
+            fi
           fi
         fi
       fi
diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 58bb66b9e..1e1db05bb 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh
@@ -141,7 +141,17 @@ for opt in "$@"; do
     case "$opt" in
     --help|-h) show_help
     ;;
-    --target=*) target="${optval}"
+    --target=*)
+      target="${optval}"
+      platform_toolset=$(echo ${target} | awk 'BEGIN{FS="-"}{print $4}')
+      case "$platform_toolset" in
+        clangcl) platform_toolset="ClangCl"
+        ;;
+        "")
+        ;;
+        *) die Unrecognized Visual Studio Platform Toolset in $opt
+        ;;
+      esac
     ;;
     --out=*) outfile="$optval"
     ;;
@@ -259,6 +269,10 @@ case "$target" in
     ;;
     arm64*)
         platforms[0]="ARM64"
+        # As of Visual Studio 2022 17.5.5, clang-cl does not support ARM64EC.
+        if [ "$vs_ver" -ge 17 -a "$platform_toolset" != "ClangCl" ]; then
+            platforms[1]="ARM64EC"
+        fi
         asm_Debug_cmdline="armasm64 -nologo -oldit "%(FullPath)""
         asm_Release_cmdline="armasm64 -nologo -oldit "%(FullPath)""
     ;;
@@ -335,17 +349,21 @@ generate_vcxproj() {
   else
     tag_content ConfigurationType StaticLibrary
   fi
-  if [ "$vs_ver" = "14" ]; then
-    tag_content PlatformToolset v140
-  fi
-  if [ "$vs_ver" = "15" ]; then
-    tag_content PlatformToolset v141
-  fi
-  if [ "$vs_ver" = "16" ]; then
-    tag_content PlatformToolset v142
-  fi
-  if [ "$vs_ver" = "17" ]; then
-    tag_content PlatformToolset v143
+  if [ -n "$platform_toolset" ]; then
+    tag_content PlatformToolset "$platform_toolset"
+  else
+    if [ "$vs_ver" = "14" ]; then
+      tag_content PlatformToolset v140
+    fi
+    if [ "$vs_ver" = "15" ]; then
+      tag_content PlatformToolset v141
+    fi
+    if [ "$vs_ver" = "16" ]; then
+      tag_content PlatformToolset v142
+    fi
+    if [ "$vs_ver" = "17" ]; then
+      tag_content PlatformToolset v143
+    fi
   fi
   tag_content CharacterSet Unicode
   if [ "$config" = "Release" ]; then
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index f4edeaad5..0b9e16738 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -487,7 +487,7 @@ if ($opts{arch} eq 'x86') {
   @ALL_ARCHS = filter(qw/neon_asm neon/);
   arm;
 } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
-  @ALL_ARCHS = filter(qw/neon/);
+  @ALL_ARCHS = filter(qw/neon neon_dotprod neon_i8mm sve/);
   @REQUIRES = filter(qw/neon/);
   &require(@REQUIRES);
   arm;
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
index b2b2fc2dc..cb7c1948a 100644
--- a/config/arm-neon/vp9_rtcd.h
+++ b/config/arm-neon/vp9_rtcd.h
@@ -21,7 +21,9 @@ struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct macroblock_plane;
+struct vp9_sad_table;
+struct ScanOrder;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -32,13 +34,15 @@ extern "C" {
 #endif
 
 int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
-#define vp9_block_error vp9_block_error_c
+int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_neon
 
 int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
-#define vp9_block_error_fp vp9_block_error_fp_c
+int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_neon
 
-int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int
sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_neon void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -57,17 +61,20 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); -#define vp9_highbd_block_error vp9_highbd_block_error_c +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_neon void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_neon void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_neon void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c @@ -84,12 +91,12 @@ void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, 
const struct ScanOrder *const scan_order); +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -107,12 +114,12 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_neon -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, 
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/arm-neon/vpx_config.asm b/config/arm-neon/vpx_config.asm index 2ccf56fde..6932c6eca 100644 --- a/config/arm-neon/vpx_config.asm +++ b/config/arm-neon/vpx_config.asm @@ -2,13 +2,17 @@ @ using the ads2gas.pl script. .syntax unified .equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 0 .equ VPX_ARCH_MIPS , 0 .equ VPX_ARCH_X86 , 0 .equ VPX_ARCH_X86_64 , 0 .equ VPX_ARCH_PPC , 0 .equ VPX_ARCH_LOONGARCH , 0 -.equ HAVE_NEON , 1 .equ HAVE_NEON_ASM , 1 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 @@ -78,7 +82,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 0 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 1 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -91,4 +94,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/config/arm-neon/vpx_config.h b/config/arm-neon/vpx_config.h index 3fa6606ab..ae183d7d6 100644 --- a/config/arm-neon/vpx_config.h +++ b/config/arm-neon/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 0 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 0 #define VPX_ARCH_X86_64 0 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 1 #define HAVE_NEON_ASM 1 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h index 565105892..578f1c5dc 100644 --- a/config/arm-neon/vpx_dsp_rtcd.h +++ b/config/arm-neon/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -66,16 +70,20 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t 
src_stride, uint8_t *d #define vpx_convolve_copy vpx_convolve_copy_neon void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -94,28 +102,36 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t * #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon void vpx_d207_predictor_32x32_c(uint8_t *dst, 
ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -137,16 +153,20 @@ void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *abo #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c @@ -838,10 +858,12 @@ unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_strid #define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p); -#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c +unsigned int vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_neon unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p); -#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c +unsigned int vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_neon void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, 
const uint16_t *ref, int ref_stride); void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); @@ -880,16 +902,20 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, ui #define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_neon void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_neon void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_neon void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_neon void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -908,28 +934,36 @@ void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const u #define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c +void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_neon void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_neon void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define 
vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_neon void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_neon void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_neon void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_neon void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_neon void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_neon void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -948,16 +982,20 @@ void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const ui #define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_neon void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_neon void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, 
ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_neon void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_neon void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -1072,13 +1110,16 @@ void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_16x16 vpx_highbd_hadamard_16x16_c +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_16x16 vpx_highbd_hadamard_16x16_neon void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_32x32 vpx_highbd_hadamard_32x32_c +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_32x32 vpx_highbd_hadamard_32x32_neon void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_8x8 vpx_highbd_hadamard_8x8_c +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_8x8 vpx_highbd_hadamard_8x8_neon void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); @@ -1187,14 +1228,15 @@ void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t * #define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); -#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_neon -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder 
*const scan_order); +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1205,8 +1247,8 @@ unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1217,8 +1259,8 @@ unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void 
vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_neon unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1229,8 +1271,8 @@ unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1241,8 +1283,8 @@ unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1253,8 +1295,8 @@ unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ 
-1265,8 +1307,8 @@ unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1277,8 +1319,8 @@ unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1289,8 +1331,8 @@ unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1301,8 +1343,8 @@ unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon -void vpx_highbd_sad64x32x4d_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1313,8 +1355,8 @@ unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1325,8 +1367,8 @@ unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1337,8 +1379,8 @@ unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1349,12 +1391,121 @@ unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_neon + +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_neon + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_neon + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_neon + +unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_neon + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_neon + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned 
int vpx_highbd_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 vpx_highbd_sad_skip_32x16_neon + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_neon + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_neon + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_neon + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_neon + +void vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_neon + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_neon + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_neon + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_neon + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_neon + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_neon + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_neon + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_neon + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_neon + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_neon + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_neon + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_neon + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_neon + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_neon + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_neon + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); -#define vpx_highbd_satd vpx_highbd_satd_c +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length); +#define vpx_highbd_satd vpx_highbd_satd_neon + +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int 
height); +int64_t vpx_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse vpx_highbd_sse_neon void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); @@ -1515,20 +1666,23 @@ unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uin #define vpx_mse16x16 vpx_mse16x16_neon unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_c +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_neon unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse8x16 vpx_mse8x16_c +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_neon unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse8x8 vpx_mse8x8_c +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_neon -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_neon -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t 
*iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1539,8 +1693,8 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_neon -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_neon unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1551,8 +1705,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_neon -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_neon unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1563,8 +1717,8 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_neon -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define 
vpx_sad16x8x4d vpx_sad16x8x4d_neon unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1575,8 +1729,8 @@ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_neon -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_neon unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1587,8 +1741,8 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_neon -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_neon unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1599,8 +1753,8 @@ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_neon -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_neon unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1611,8 +1765,8 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_neon -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const 
ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_neon unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1623,8 +1777,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_neon -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_neon unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1635,8 +1789,8 @@ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_neon -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_neon unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1647,8 +1801,8 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_neon -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_neon unsigned int 
vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1659,8 +1813,8 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_neon -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_neon unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1671,8 +1825,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_neon -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_neon unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1683,10 +1837,114 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_neon -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_neon +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_neon + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t 
sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_neon + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_neon + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_neon + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_neon + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_neon + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_neon + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_neon + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_neon + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_neon + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_neon + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_neon + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 
vpx_sad_skip_4x4_neon + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_neon + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_neon + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_neon + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_neon + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon + +unsigned int 
vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon + int vpx_satd_c(const tran_low_t *coeff, int length); int vpx_satd_neon(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_neon @@ -1710,6 +1968,10 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_neon + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/arm-neon/vpx_version.h +++ b/config/arm-neon/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. 
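The vpx_sad_skip_* entry points added above are the sub-sampled SAD kernels: instead of accumulating absolute differences over every row of a block, they visit alternate rows and scale the result, roughly halving the memory traffic during motion search. A minimal sketch of the reference behaviour, assuming the upstream convention of summing even rows only and doubling (the per-size vpx_sad_skip_WxH_c kernels are macro-generated in vpx_dsp/sad.c with width and height baked in; the generic parameters here are purely for illustration):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy skip-SAD: sum of absolute differences over even rows only, doubled
 * to approximate the full-block SAD. */
static unsigned int sad_skip(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             int width, int height) {
  unsigned int sad = 0;
  for (int r = 0; r < height; r += 2) {  /* visit every other row */
    for (int c = 0; c < width; ++c) sad += abs(src[c] - ref[c]);
    src += 2 * src_stride;               /* skip the odd row entirely */
    ref += 2 * ref_stride;
  }
  return 2 * sad;                        /* compensate for the skipped rows */
}

int main(void) {
  uint8_t src[16], ref[16];
  for (int i = 0; i < 16; ++i) { src[i] = 10; ref[i] = 12; }
  /* Uniform 4x4 blocks: the full SAD is 4 * 4 * 2 = 32, and the skipped
   * estimate 2 * (2 rows * 4 cols * 2) agrees exactly. */
  printf("skip SAD = %u\n", sad_skip(src, 4, ref, 4, 4, 4));
  return 0;
}

The neighbouring vpx_sse additions return the plain sum of squared errors over a width x height block as an int64_t, and because this build disables runtime CPU detection, each #define (for example, #define vpx_sad_skip_16x16 vpx_sad_skip_16x16_neon) binds the dispatch name straight to the NEON kernel at compile time. The generated vpx_version.h below records the matching library bump from v1.13.0 to v1.14.0; note that VERSION_PACKED encodes it as (1 << 16) | (14 << 8) | 0 = 0x010E00.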
#define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h index b2b2fc2dc..cb7c1948a 100644 --- a/config/arm64/vp9_rtcd.h +++ b/config/arm64/vp9_rtcd.h @@ -21,7 +21,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -32,13 +34,15 @@ extern "C" { #endif int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); -#define vp9_block_error vp9_block_error_c +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_neon int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); -#define vp9_block_error_fp vp9_block_error_fp_c +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_neon -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); -int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_neon void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -57,17 +61,20 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_fwht4x4 vp9_fwht4x4_c int64_t vp9_highbd_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); -#define vp9_highbd_block_error vp9_highbd_block_error_c +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd); +#define vp9_highbd_block_error vp9_highbd_block_error_neon void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_neon void 
vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); #define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); -#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_neon void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); #define vp9_highbd_fwht4x4 vp9_highbd_fwht4x4_c @@ -84,12 +91,12 @@ void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -107,12 +114,12 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, 
uint8_t *dest, int stride, int void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_neon -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/arm64/vpx_config.asm b/config/arm64/vpx_config.asm index c4b840b89..f23e27fe7 100644 --- a/config/arm64/vpx_config.asm +++ b/config/arm64/vpx_config.asm @@ -2,13 +2,17 @@ @ using the ads2gas.pl script. 
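The quantizer hunks above capture the other significant API refactor in this update: vp9_quantize_fp, vp9_highbd_quantize_fp, and their 32x32 variants (like vpx_quantize_b earlier) now take a const struct macroblock_plane * and a const struct ScanOrder * in place of six loose pointer arguments (zbin/round/quant/quant_shift plus scan/iscan), so per-plane quantizer state travels as one object. It lands alongside the motion-search change higher in the same header, where vp9_diamond_search_sad now takes a struct vp9_sad_table and an explicit start_mv_sad instead of the old vp9_variance_vtable. Below is a compilable toy in the new calling shape, with deliberately trimmed struct layouts; the real definitions, in vp9/encoder/vp9_block.h and the scan-table headers, carry considerably more state, and the real kernels additionally handle zbin skipping and 32x32 rounding:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef int32_t tran_low_t; /* matches the high-bit-depth build's typedef */

/* Hypothetical, trimmed views of the structs named in the new prototypes. */
struct macroblock_plane { const int16_t *round; const int16_t *quant; };
struct ScanOrder { const int16_t *scan; };

/* Simplified fp-style quantizer: round, multiply in Q16 fixed point,
 * dequantize, and track the end-of-block (last nonzero in scan order). */
static void quantize_fp_sketch(const tran_low_t *coeff, intptr_t n_coeffs,
                               const struct macroblock_plane *p,
                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
                               const int16_t *dequant, uint16_t *eob,
                               const struct ScanOrder *so) {
  *eob = 0;
  for (intptr_t i = 0; i < n_coeffs; ++i) {
    const int rc = so->scan[i];        /* visit coefficients in scan order */
    const int k = rc != 0;             /* slot 0 is DC, slot 1 is AC */
    const int q = (abs((int)coeff[rc]) + p->round[k]) * p->quant[k] >> 16;
    qcoeff[rc] = coeff[rc] < 0 ? -q : q;
    dqcoeff[rc] = qcoeff[rc] * dequant[k];
    if (q) *eob = (uint16_t)(i + 1);
  }
}

int main(void) {
  const tran_low_t coeff[4] = { 100, -25, 3, 0 };
  const int16_t round[2] = { 8, 8 }, quant[2] = { 1 << 14, 1 << 14 };
  const int16_t dequant[2] = { 4, 4 }, scan[4] = { 0, 1, 2, 3 };
  const struct macroblock_plane p = { round, quant };
  const struct ScanOrder so = { scan };
  tran_low_t qcoeff[4], dqcoeff[4];
  uint16_t eob;
  quantize_fp_sketch(coeff, 4, &p, qcoeff, dqcoeff, dequant, &eob, &so);
  printf("eob = %u, dc -> %d\n", eob, (int)qcoeff[0]); /* eob = 3, dc -> 27 */
  return 0;
}

The vpx_config.asm hunk that continues below shows the build-system side of the same update: a new VPX_ARCH_AARCH64 flag, plus HAVE_NEON_DOTPROD, HAVE_NEON_I8MM, and HAVE_SVE gates for the newer Arm SIMD extensions, all disabled in this particular configuration.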
.syntax unified .equ VPX_ARCH_ARM , 1 +.equ VPX_ARCH_AARCH64 , 1 .equ VPX_ARCH_MIPS , 0 .equ VPX_ARCH_X86 , 0 .equ VPX_ARCH_X86_64 , 0 .equ VPX_ARCH_PPC , 0 .equ VPX_ARCH_LOONGARCH , 0 -.equ HAVE_NEON , 1 .equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 1 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 @@ -78,7 +82,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 0 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 1 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -91,4 +94,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/config/arm64/vpx_config.c b/config/arm64/vpx_config.c index 13490c81c..d9a44071c 100644 --- a/config/arm64/vpx_config.c +++ b/config/arm64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--force-target=armv8-linux-gcc --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; +static const char* const cfg = "--target=armv8-linux-gcc --disable-neon_dotprod --disable-neon_i8mm --enable-external-build --enable-realtime-only --enable-pic --disable-runtime-cpu-detect --disable-install-docs --size-limit=4096x3072 --enable-vp9-highbitdepth"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/config/arm64/vpx_config.h b/config/arm64/vpx_config.h index 247c0ea6f..03f681712 100644 --- a/config/arm64/vpx_config.h +++ b/config/arm64/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 1 +#define VPX_ARCH_AARCH64 1 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 0 #define VPX_ARCH_X86_64 0 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 1 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 1 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h index 565105892..578f1c5dc 100644 --- a/config/arm64/vpx_dsp_rtcd.h +++ b/config/arm64/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -66,16 +70,20 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *d #define vpx_convolve_copy vpx_convolve_copy_neon void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_16x16 
vpx_d117_predictor_16x16_c +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_neon void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_neon void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_neon void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_neon void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -94,28 +102,36 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t * #define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_neon void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_neon void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_neon void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_neon void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_neon void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, 
const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_neon void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_neon void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_neon void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -137,16 +153,20 @@ void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *abo #define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_neon void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_neon void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_neon void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); -#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_neon void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); #define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c @@ -838,10 +858,12 @@ unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_strid #define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p); -#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c +unsigned int vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p); +#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_neon unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p); -#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c +unsigned int vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p); +#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_neon void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride); @@ -880,16 +902,20 @@ void 
vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, ui #define vpx_highbd_convolve_copy vpx_highbd_convolve_copy_neon void vpx_highbd_d117_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_c +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_16x16 vpx_highbd_d117_predictor_16x16_neon void vpx_highbd_d117_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_c +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_32x32 vpx_highbd_d117_predictor_32x32_neon void vpx_highbd_d117_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_c +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_4x4 vpx_highbd_d117_predictor_4x4_neon void vpx_highbd_d117_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_c +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d117_predictor_8x8 vpx_highbd_d117_predictor_8x8_neon void vpx_highbd_d135_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -908,28 +934,36 @@ void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const u #define vpx_highbd_d135_predictor_8x8 vpx_highbd_d135_predictor_8x8_neon void vpx_highbd_d153_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_c +void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_16x16 vpx_highbd_d153_predictor_16x16_neon void vpx_highbd_d153_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_c +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_32x32 vpx_highbd_d153_predictor_32x32_neon void vpx_highbd_d153_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_c +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_4x4 vpx_highbd_d153_predictor_4x4_neon void vpx_highbd_d153_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define 
vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_c +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d153_predictor_8x8 vpx_highbd_d153_predictor_8x8_neon void vpx_highbd_d207_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_c +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_16x16 vpx_highbd_d207_predictor_16x16_neon void vpx_highbd_d207_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_c +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_32x32 vpx_highbd_d207_predictor_32x32_neon void vpx_highbd_d207_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_c +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_4x4 vpx_highbd_d207_predictor_4x4_neon void vpx_highbd_d207_predictor_8x8_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_c +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d207_predictor_8x8 vpx_highbd_d207_predictor_8x8_neon void vpx_highbd_d45_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -948,16 +982,20 @@ void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const ui #define vpx_highbd_d45_predictor_8x8 vpx_highbd_d45_predictor_8x8_neon void vpx_highbd_d63_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_c +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_16x16 vpx_highbd_d63_predictor_16x16_neon void vpx_highbd_d63_predictor_32x32_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_c +void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_32x32 vpx_highbd_d63_predictor_32x32_neon void vpx_highbd_d63_predictor_4x4_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_c +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_4x4 vpx_highbd_d63_predictor_4x4_neon void vpx_highbd_d63_predictor_8x8_c(uint16_t *dst, 
ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); -#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_c +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); +#define vpx_highbd_d63_predictor_8x8 vpx_highbd_d63_predictor_8x8_neon void vpx_highbd_dc_128_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd); @@ -1072,13 +1110,16 @@ void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_neon void vpx_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_16x16 vpx_highbd_hadamard_16x16_c +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_16x16 vpx_highbd_hadamard_16x16_neon void vpx_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_32x32 vpx_highbd_hadamard_32x32_c +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_32x32 vpx_highbd_hadamard_32x32_neon void vpx_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); -#define vpx_highbd_hadamard_8x8 vpx_highbd_hadamard_8x8_c +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff); +#define vpx_highbd_hadamard_8x8 vpx_highbd_hadamard_8x8_neon void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint16_t *dest, int stride, int bd); void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd); @@ -1187,14 +1228,15 @@ void vpx_highbd_lpf_vertical_8_dual_neon(uint16_t *s, int pitch, const uint8_t * #define vpx_highbd_lpf_vertical_8_dual vpx_highbd_lpf_vertical_8_dual_neon void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); -#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); +#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_neon -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1205,8 +1247,8 @@ unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1217,8 +1259,8 @@ unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d 
vpx_highbd_sad16x32x4d_neon unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1229,8 +1271,8 @@ unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1241,8 +1283,8 @@ unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1253,8 +1295,8 @@ unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1265,8 +1307,8 @@ unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1277,8 +1319,8 @@ unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1289,8 +1331,8 @@ unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1301,8 +1343,8 @@ unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon -void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int 
src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1313,8 +1355,8 @@ unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1325,8 +1367,8 @@ unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1337,8 +1379,8 @@ unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const 
ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1349,12 +1391,121 @@ unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_neon + +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_neon + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_neon + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_neon + +unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_neon + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_neon + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 
vpx_highbd_sad_skip_32x16_neon + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_neon + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_neon + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_neon + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_neon + +void vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_neon + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_neon + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_neon + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_neon + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_neon + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_neon + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, 
int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_neon + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_neon + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_neon + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_neon + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_neon + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_neon + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_neon + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_neon + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_neon + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); -#define vpx_highbd_satd vpx_highbd_satd_c +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length); +#define vpx_highbd_satd vpx_highbd_satd_neon + +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +int64_t vpx_highbd_sse_neon(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse 
vpx_highbd_sse_neon void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); @@ -1515,20 +1666,23 @@ unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uin #define vpx_mse16x16 vpx_mse16x16_neon unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse16x8 vpx_mse16x8_c +unsigned int vpx_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_neon unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse8x16 vpx_mse8x16_c +unsigned int vpx_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_neon unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); -#define vpx_mse8x8 vpx_mse8x8_c +unsigned int vpx_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_neon -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_neon -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1539,8 +1693,8 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_neon -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_neon unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1551,8 +1705,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_neon -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_neon unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1563,8 +1717,8 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_neon -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_neon unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1575,8 
+1729,8 @@ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_neon -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_neon unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1587,8 +1741,8 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_neon -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_neon unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1599,8 +1753,8 @@ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_neon -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_neon unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1611,8 +1765,8 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_neon -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int 
ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_neon unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1623,8 +1777,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_neon -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_neon unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1635,8 +1789,8 @@ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_neon -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_neon unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1647,8 +1801,8 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_neon -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_neon unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1659,8 +1813,8 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, 
int src_stride, const uin unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_neon -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_neon unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1671,8 +1825,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_neon -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_neon unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1683,10 +1837,114 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_neon -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_neon +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_neon + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_neon + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride); +unsigned int vpx_sad_skip_16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_neon + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_neon + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_neon + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_neon + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_neon + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_neon + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_neon + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_neon + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_neon + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_neon + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_neon + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void 
vpx_sad_skip_4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_neon + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_neon + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_neon + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_neon + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_neon + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_neon + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_neon + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_neon + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_neon + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_neon + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_neon + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_neon(const uint8_t *src_ptr, int 
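/*
 * Note the dispatch pattern running through this header: every generic
 * vpx_* name is a macro pinned at configure time to the best variant this
 * target enables, so call sites never name an ISA. An illustrative caller
 * (buffer setup omitted; the helper name is hypothetical):
 */
#include <stdint.h>

static void prune_candidates_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *const refs[4],
                                    int ref_stride) {
  uint32_t sads[4];
  /* Generic name; on this arm64 config the macro binds it to the Neon
   * kernel, i.e. vpx_sad_skip_8x8x4d_neon(...). */
  vpx_sad_skip_8x8x4d(src, src_stride, refs, ref_stride, sads);
  (void)sads;
}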
src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_neon + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_neon + int vpx_satd_c(const tran_low_t *coeff, int length); int vpx_satd_neon(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_neon @@ -1710,6 +1968,10 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +int64_t vpx_sse_neon(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_neon + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_neon diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/arm64/vpx_version.h +++ b/config/arm64/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. 
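/*
 * VERSION_PACKED below folds major/minor/patch into one comparable
 * integer: (major << 16) | (minor << 8) | patch. For the bump in this
 * diff, v1.13.0 -> v1.14.0:
 *   old: (1 << 16) | (13 << 8) | 0 = 65536 + 3328 + 0 = 68864 = 0x010D00
 *   new: (1 << 16) | (14 << 8) | 0 = 65536 + 3584 + 0 = 69120 = 0x010E00
 */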
#define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" diff --git a/config/generic/vp9_rtcd.h b/config/generic/vp9_rtcd.h index 07d24536d..d3379b6dd 100644 --- a/config/generic/vp9_rtcd.h +++ b/config/generic/vp9_rtcd.h @@ -21,7 +21,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -37,7 +39,7 @@ int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, in int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_c -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -76,10 +78,10 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -94,10 +96,10 @@ void 
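/*
 * Two encoder-side signature changes recur through these rtcd headers: the
 * diamond search now receives the starting vector's SAD (start_mv_sad),
 * apparently precomputed by the caller, and its kernel table is a
 * vp9_sad_table rather than the old vp9_variance_vtable. The struct itself
 * lives in the encoder sources; a hypothetical sketch of the shape it
 * plausibly takes (field and type names here are assumptions, not the
 * library's):
 */
#include <stdint.h>

typedef unsigned int (*sad_fn_sketch)(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride);
typedef void (*sad_x4d_fn_sketch)(const uint8_t *src, int src_stride,
                                  const uint8_t *const refs[4],
                                  int ref_stride, uint32_t sads[4]);

struct sad_table_sketch {
  sad_fn_sketch sdf;        /* single-candidate SAD */
  sad_x4d_fn_sketch sdx4df; /* 4-candidate SAD, the x4d kernels above */
};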
vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_c -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/generic/vpx_config.asm b/config/generic/vpx_config.asm index ffeb85ebb..f14048b69 100644 --- a/config/generic/vpx_config.asm +++ b/config/generic/vpx_config.asm @@ -2,13 +2,17 @@ @ using the ads2gas.pl script. .syntax unified .equ VPX_ARCH_ARM , 0 +.equ VPX_ARCH_AARCH64 , 0 .equ VPX_ARCH_MIPS , 0 .equ VPX_ARCH_X86 , 0 .equ VPX_ARCH_X86_64 , 0 .equ VPX_ARCH_PPC , 0 .equ VPX_ARCH_LOONGARCH , 0 -.equ HAVE_NEON , 0 .equ HAVE_NEON_ASM , 0 +.equ HAVE_NEON , 0 +.equ HAVE_NEON_DOTPROD , 0 +.equ HAVE_NEON_I8MM , 0 +.equ HAVE_SVE , 0 .equ HAVE_MIPS32 , 0 .equ HAVE_DSPR2 , 0 .equ HAVE_MSA , 0 @@ -78,7 +82,6 @@ .equ CONFIG_MULTI_RES_ENCODING , 0 .equ CONFIG_TEMPORAL_DENOISING , 1 .equ CONFIG_VP9_TEMPORAL_DENOISING , 0 -.equ CONFIG_CONSISTENT_RECODE , 0 .equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0 .equ CONFIG_VP9_HIGHBITDEPTH , 1 .equ CONFIG_BETTER_HW_COMPATIBILITY , 0 @@ -91,4 +94,5 @@ .equ CONFIG_EMULATE_HARDWARE , 0 .equ CONFIG_NON_GREEDY_MV , 0 .equ CONFIG_RATE_CTRL , 0 +.equ CONFIG_COLLECT_COMPONENT_TIMING , 0 .section .note.GNU-stack,"",%progbits diff --git a/config/generic/vpx_config.h b/config/generic/vpx_config.h index c9d8393c8..bade04289 100644 --- a/config/generic/vpx_config.h +++ b/config/generic/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 0 #define VPX_ARCH_X86_64 0 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define 
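/*
 * The config block gains finer-grained Arm feature switches: AArch64 is
 * now its own arch flag, and Neon DotProd, Neon I8MM and SVE each get a
 * HAVE_* gate (all 0 for this generic target). Specialized declarations
 * are fenced the same way existing HAVE_NEON code is; the pattern, with
 * the declaration shown purely for illustration:
 */
#include <stdint.h>

#if HAVE_NEON_DOTPROD
/* Only declared/compiled when the configure step enabled the extension. */
unsigned int vpx_sad16x16_neon_dotprod(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride);
#endif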
CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/generic/vpx_dsp_rtcd.h b/config/generic/vpx_dsp_rtcd.h index 328601f76..256cbdfa5 100644 --- a/config/generic/vpx_dsp_rtcd.h +++ b/config/generic/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -930,10 +934,10 @@ void vpx_highbd_lpf_vertical_8_dual_c(uint16_t *s, int pitch, const uint8_t *bli void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_c -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -942,7 +946,7 @@ unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -951,7 +955,7 @@ unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const 
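/*
 * The quantizer refactor visible above replaces long argument lists with
 * two structs: the per-plane tables (formerly zbin_ptr, round_ptr,
 * quant_ptr, quant_shift_ptr) now travel inside macroblock_plane, and the
 * scan/iscan pair inside ScanOrder; the forward declarations under
 * CONFIG_VP9_ENCODER keep this header free of encoder includes. A
 * hypothetical mirror of the bundled state (field names are assumptions
 * for illustration):
 */
#include <stdint.h>

struct plane_quant_sketch {
  const int16_t *zbin;        /* dead-zone thresholds */
  const int16_t *round;       /* rounding offsets */
  const int16_t *quant;       /* quantizer multipliers */
  const int16_t *quant_shift; /* post-multiply shifts */
};

struct scan_order_sketch {
  const int16_t *scan;  /* coefficient visit order */
  const int16_t *iscan; /* inverse mapping, position -> scan index */
};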
uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -960,7 +964,7 @@ unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -969,7 +973,7 @@ unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -978,7 +982,7 @@ unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -987,7 +991,7 @@ unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -996,7 +1000,7 @@ unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t 
*second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1005,7 +1009,7 @@ unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1014,7 +1018,7 @@ unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c -void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1023,7 +1027,7 @@ unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1032,7 +1036,7 @@ unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t *ref_ptr, int ref_stride); @@ -1041,7 +1045,7 @@ unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1050,12 +1054,93 @@ unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_c + +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_c + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_c + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_c + +unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_c + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_c + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 vpx_highbd_sad_skip_32x16_c + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_c + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_c + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_c + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, 
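/*
 * The highbd kernels keep uint8_t pointers in their signatures even though
 * the buffers hold 16-bit samples; implementations convert the pointer
 * back (the library has a dedicated macro for this in vpx_dsp_common.h).
 * A simplified sketch of the convention, with a plain cast standing in for
 * that macro:
 */
#include <stdint.h>
#include <stdlib.h>

static unsigned int highbd_sad4x4_sketch(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride) {
  const uint16_t *src = (const uint16_t *)src8;
  const uint16_t *ref = (const uint16_t *)ref8;
  unsigned int sad = 0;
  for (int r = 0; r < 4; ++r)
    for (int c = 0; c < 4; ++c)
      sad += abs(src[r * src_stride + c] - ref[r * ref_stride + c]);
  return sad;
}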
int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_c + +void vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_c + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_c + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_c + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_c + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_c + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_c + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_c + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_c + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_c + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_c + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_c + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_c + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_c + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_c + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_c + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); #define vpx_highbd_satd vpx_highbd_satd_c +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse vpx_highbd_sse_c + void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, 
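/*
 * vpx_sse / vpx_highbd_sse (declared above) return the raw sum of squared
 * differences over an arbitrary width x height window, accumulated in 64
 * bits so large planes cannot overflow a 32-bit counter. A plain C sketch
 * of the 8-bit contract:
 */
#include <stdint.h>

static int64_t sse_sketch(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  int64_t sse = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int d = a[c] - b[c];
      sse += (int64_t)d * d; /* widen before accumulating */
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}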
ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); #define vpx_highbd_subtract_block vpx_highbd_subtract_block_c @@ -1185,10 +1270,10 @@ unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); #define vpx_mse8x8 vpx_mse8x8_c -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_c -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1197,7 +1282,7 @@ unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_c -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_c unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1206,7 +1291,7 @@ unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_c -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_c unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1215,7 +1300,7 @@ unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_c -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * 
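/*
 * One asymmetry worth noting above: vpx_quantize_b keeps its intptr_t
 * n_coeffs parameter, while the 32x32 variant drops it, presumably because
 * that block size fixes the coefficient count at 32 * 32 = 1024.
 */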
const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_c unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1224,7 +1309,7 @@ unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_c -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_c unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1233,7 +1318,7 @@ unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_c -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_c unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1242,7 +1327,7 @@ unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_c -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_c unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1251,7 +1336,7 @@ unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_c -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_c unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1260,7 +1345,7 @@ unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_c -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_c unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1269,7 +1354,7 @@ unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_c -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_c unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1278,7 +1363,7 @@ unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_c -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_c unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1287,7 +1372,7 @@ unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_c -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_c unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1296,7 +1381,7 @@ unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_c -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_c unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1305,9 +1390,87 @@ unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_c -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const 
uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_c +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_c + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_c + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_c + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_c + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_c + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_c + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_c + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_c + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_c + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_c + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_c + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_c + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_c + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_c + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_c + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_c + +unsigned int 
vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_c + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_c + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_c + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_c + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_c + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_c + int vpx_satd_c(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_c @@ -1329,6 +1492,9 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_c + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); #define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/generic/vpx_version.h +++ b/config/generic/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. 
#define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" diff --git a/config/x86/vp8_rtcd.h b/config/x86/vp8_rtcd.h index 5f7b32673..99ea7c278 100644 --- a/config/x86/vp8_rtcd.h +++ b/config/x86/vp8_rtcd.h @@ -45,15 +45,6 @@ void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_lin void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_ssse3 -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c - int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_sse2 diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h index 580d55a28..bd3233e20 100644 --- a/config/x86/vp9_rtcd.h +++ b/config/x86/vp9_rtcd.h @@ -21,7 +21,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -39,7 +41,7 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_sse2 -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -83,10 +85,10 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -104,13 +106,13 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_ssse3 -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const 
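/*
 * On x86 several SIMD levels of one kernel coexist (_c, _sse2, _ssse3
 * here), and the generated header pins the macro to the strongest level
 * enabled at configure time, so this build resolves vp9_quantize_fp(...)
 * to vp9_quantize_fp_ssse3(...). Builds configured for runtime dispatch
 * would instead select among these via function pointers during rtcd
 * initialization.
 */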
tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/x86/vpx_config.asm b/config/x86/vpx_config.asm index 8c108efe9..360909d1e 100644 --- a/config/x86/vpx_config.asm +++ b/config/x86/vpx_config.asm @@ -1,11 +1,15 @@ %define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 %define VPX_ARCH_MIPS 0 %define VPX_ARCH_X86 1 %define VPX_ARCH_X86_64 0 %define VPX_ARCH_PPC 0 %define VPX_ARCH_LOONGARCH 0 -%define HAVE_NEON 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -75,7 +79,6 @@ %define CONFIG_MULTI_RES_ENCODING 0 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 1 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -88,3 +91,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/config/x86/vpx_config.h b/config/x86/vpx_config.h index 6cc7eda34..a7f41a9ae 100644 --- a/config/x86/vpx_config.h +++ b/config/x86/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 1 #define VPX_ARCH_X86_64 0 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h index 91242deee..67c150490 100644 --- a/config/x86/vpx_dsp_rtcd.h +++ b/config/x86/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -1185,12 +1189,12 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t * void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2 -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2 unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1201,8 +1205,8 @@ unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2 -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1213,8 +1217,8 @@ unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2 -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1225,8 +1229,8 @@ unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2 -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1237,8 +1241,8 @@ unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2 -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1249,8 +1253,8 @@ unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define 
vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2 -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1261,8 +1265,8 @@ unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2 -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1271,8 +1275,8 @@ unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1281,8 +1285,8 @@ unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void 
vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1293,8 +1297,8 @@ unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2 -void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1305,8 +1309,8 @@ unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2 -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1317,8 +1321,8 @@ unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2 -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d 
vpx_highbd_sad8x16x4d_sse2 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1329,8 +1333,8 @@ unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2 -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1341,13 +1345,115 @@ unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2 -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2 +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_sse2 + +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_sse2 + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_sse2 + 
+unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_sse2 + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 vpx_highbd_sad_skip_32x16_sse2 + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_sse2 + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_sse2 + +void vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_sse2 + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_c + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_c + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_c + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void 
vpx_highbd_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_sse2 + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_sse2 + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_sse2 + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_sse2 + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_sse2 + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_c + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_c + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_sse2 + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_sse2 + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); #define vpx_highbd_satd vpx_highbd_satd_c +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const 
uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse vpx_highbd_sse_c + void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); #define vpx_highbd_subtract_block vpx_highbd_subtract_block_c @@ -1537,13 +1643,13 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b vpx_quantize_b_ssse3 -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t 
*dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1554,8 +1660,8 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1566,8 +1672,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1578,8 +1684,8 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1590,8 +1696,8 @@ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define 
vpx_sad32x16_avg vpx_sad32x16_avg_sse2 -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1602,8 +1708,8 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2 -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1614,8 +1720,8 @@ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_sse2 -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1626,8 +1732,8 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const 
uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1638,8 +1744,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1650,8 +1756,8 @@ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_sse2 -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1662,8 +1768,8 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2 -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1674,8 +1780,8 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 -void vpx_sad8x16x4d_c(const 
uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1686,8 +1792,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1698,10 +1804,110 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_sse2 + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_sse2 + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_sse2 + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_sse2 + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_sse2 + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_sse2 + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_sse2 + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_sse2 + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_sse2 + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_sse2 + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 + int vpx_satd_c(const tran_low_t *coeff, int length); int vpx_satd_sse2(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_sse2 @@ -1725,6 +1931,9 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel 
*filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_c + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/x86/vpx_version.h +++ b/config/x86/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" diff --git a/config/x86_64/vp8_rtcd.h b/config/x86_64/vp8_rtcd.h index 5f7b32673..99ea7c278 100644 --- a/config/x86_64/vp8_rtcd.h +++ b/config/x86_64/vp8_rtcd.h @@ -45,15 +45,6 @@ void vp8_bilinear_predict8x8_sse2(unsigned char *src_ptr, int src_pixels_per_lin void vp8_bilinear_predict8x8_ssse3(unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch); #define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_ssse3 -void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_b vp8_blend_b_c - -void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_mb_inner vp8_blend_mb_inner_c - -void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride); -#define vp8_blend_mb_outer vp8_blend_mb_outer_c - int vp8_block_error_c(short *coeff, short *dqcoeff); int vp8_block_error_sse2(short *coeff, short *dqcoeff); #define vp8_block_error vp8_block_error_sse2 diff --git a/config/x86_64/vp9_rtcd.h b/config/x86_64/vp9_rtcd.h index 580d55a28..bd3233e20 100644 --- a/config/x86_64/vp9_rtcd.h +++ b/config/x86_64/vp9_rtcd.h @@ -21,7 +21,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -39,7 +41,7 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, int64_t vp9_block_error_fp_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size); #define vp9_block_error_fp vp9_block_error_fp_sse2 -int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct 
vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); #define vp9_diamond_search_sad vp9_diamond_search_sad_c void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); @@ -83,10 +85,10 @@ void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint16_t *dest, int str void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c -void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c -void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count); @@ -104,13 +106,13 @@ void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 -void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, 
intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp vp9_quantize_fp_ssse3 -void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler); diff --git a/config/x86_64/vpx_config.asm b/config/x86_64/vpx_config.asm index fdc51d32e..d0161162e 100644 --- a/config/x86_64/vpx_config.asm +++ b/config/x86_64/vpx_config.asm @@ -1,11 +1,15 @@ %define VPX_ARCH_ARM 0 +%define VPX_ARCH_AARCH64 0 %define VPX_ARCH_MIPS 0 %define VPX_ARCH_X86 0 %define VPX_ARCH_X86_64 1 %define VPX_ARCH_PPC 0 %define VPX_ARCH_LOONGARCH 0 -%define HAVE_NEON 0 %define HAVE_NEON_ASM 0 +%define HAVE_NEON 0 +%define HAVE_NEON_DOTPROD 0 +%define HAVE_NEON_I8MM 0 +%define HAVE_SVE 0 %define HAVE_MIPS32 0 %define HAVE_DSPR2 0 %define HAVE_MSA 0 @@ -75,7 +79,6 @@ %define CONFIG_MULTI_RES_ENCODING 0 %define CONFIG_TEMPORAL_DENOISING 1 %define CONFIG_VP9_TEMPORAL_DENOISING 0 -%define CONFIG_CONSISTENT_RECODE 0 %define CONFIG_COEFFICIENT_RANGE_CHECKING 0 %define CONFIG_VP9_HIGHBITDEPTH 1 %define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -88,3 +91,4 @@ %define CONFIG_EMULATE_HARDWARE 0 %define CONFIG_NON_GREEDY_MV 0 %define CONFIG_RATE_CTRL 0 +%define CONFIG_COLLECT_COMPONENT_TIMING 0 diff --git a/config/x86_64/vpx_config.h b/config/x86_64/vpx_config.h index c624a9f8e..f33cb37d3 100644 --- a/config/x86_64/vpx_config.h +++ b/config/x86_64/vpx_config.h @@ -11,13 +11,17 @@ #define RESTRICT #define INLINE inline #define VPX_ARCH_ARM 0 +#define VPX_ARCH_AARCH64 0 #define VPX_ARCH_MIPS 0 #define VPX_ARCH_X86 0 #define VPX_ARCH_X86_64 1 #define VPX_ARCH_PPC 0 #define VPX_ARCH_LOONGARCH 0 -#define HAVE_NEON 0 #define HAVE_NEON_ASM 0 +#define HAVE_NEON 0 +#define HAVE_NEON_DOTPROD 0 +#define HAVE_NEON_I8MM 0 +#define HAVE_SVE 0 #define HAVE_MIPS32 0 #define HAVE_DSPR2 0 #define HAVE_MSA 0 @@ -87,7 +91,6 @@ #define CONFIG_MULTI_RES_ENCODING 0 #define CONFIG_TEMPORAL_DENOISING 1 #define CONFIG_VP9_TEMPORAL_DENOISING 0 -#define 
CONFIG_CONSISTENT_RECODE 0 #define CONFIG_COEFFICIENT_RANGE_CHECKING 0 #define CONFIG_VP9_HIGHBITDEPTH 1 #define CONFIG_BETTER_HW_COMPATIBILITY 0 @@ -100,6 +103,7 @@ #define CONFIG_EMULATE_HARDWARE 0 #define CONFIG_NON_GREEDY_MV 0 #define CONFIG_RATE_CTRL 0 +#define CONFIG_COLLECT_COMPONENT_TIMING 0 #define DECODE_WIDTH_LIMIT 4096 #define DECODE_HEIGHT_LIMIT 3072 #endif /* VPX_CONFIG_H */ diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h index 22401f1c0..5eb512172 100644 --- a/config/x86_64/vpx_dsp_rtcd.h +++ b/config/x86_64/vpx_dsp_rtcd.h @@ -15,6 +15,10 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif #ifdef __cplusplus @@ -1192,12 +1196,12 @@ void vpx_highbd_lpf_vertical_8_dual_sse2(uint16_t *s, int pitch, const uint8_t * void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max); #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c -void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2 -void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2
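The quantize entry points above are the headline interface change in these headers: vp9_quantize_fp, vpx_quantize_b, and their high-bitdepth and 32x32 variants now take the encoder's struct macroblock_plane and struct ScanOrder directly instead of up to seven loose int16_t pointers (zbin, round, quant, quant_shift, scan, iscan); the forward declarations added under CONFIG_VP9_ENCODER exist to make those types visible to the generated header. A minimal compilable sketch of the shape of that change, using simplified stand-in types rather than libvpx's real definitions:

/* Illustrative only: stand-in structs, not libvpx's macroblock_plane or
 * ScanOrder. The point is the call-site change implied by the prototypes
 * above: two struct pointers replace a tail of positional table pointers. */
#include <stdint.h>
#include <stdio.h>

struct plane_sketch { const int16_t *zbin, *round, *quant, *quant_shift; };
struct scan_sketch { const int16_t *scan, *iscan; };

/* Old shape: each table unpacked into its own argument. */
static void quantize_old(const int16_t *zbin, const int16_t *round,
                         const int16_t *quant, const int16_t *quant_shift,
                         const int16_t *scan, const int16_t *iscan) {
  (void)zbin; (void)round; (void)quant; (void)quant_shift;
  (void)scan; (void)iscan;
}

/* New shape: pass the structs and let the kernel read what it needs. */
static void quantize_new(const struct plane_sketch *p,
                         const struct scan_sketch *so) {
  (void)p; (void)so;
}

int main(void) {
  static const int16_t t[1] = { 0 };
  const struct plane_sketch p = { t, t, t, t };
  const struct scan_sketch so = { t, t };
  quantize_old(p.zbin, p.round, p.quant, p.quant_shift, so.scan, so.iscan);
  quantize_new(&p, &so); /* same information, two arguments instead of six */
  puts("ok");
  return 0;
}

Besides shortening every SIMD prototype, this lets future kernels pick up additional plane fields without another signature churn.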
unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1208,8 +1212,8 @@ unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2 -void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1220,8 +1224,8 @@ unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2 -void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1232,8 +1236,8 @@ unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2 -void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1244,8 +1248,8 @@ unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int
ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2 -void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1256,8 +1260,8 @@ unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2 -void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1268,8 +1272,8 @@ unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2 -void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1278,8 +1282,8 @@ unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c -void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const 
uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1288,8 +1292,8 @@ unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const u unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c -void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1300,8 +1304,8 @@ unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2 -void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1312,8 +1316,8 @@ unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, c unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2 -void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t 
sad_array[4]); #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1324,8 +1328,8 @@ unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, co unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2 -void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1336,8 +1340,8 @@ unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2 -void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1348,13 +1352,115 @@ unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, con unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2 -void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2 +unsigned int vpx_highbd_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x16 vpx_highbd_sad_skip_16x16_sse2
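The vpx_highbd_sad_skip_* prototypes that begin here (their 8-bit counterparts, vpx_sad_skip_*, appear further down) are new speed kernels for coarse motion search: they compare only every other row of the block and then double the sum, so the result stays on the same scale as a full SAD at roughly half the cost. A minimal C sketch of the idea, assuming the usual definition rather than quoting the tree's implementation:

/* Skip-SAD sketch for one 8x8 block: visit rows 0, 2, 4, 6 and rescale.
 * Illustration only; the real kernels are generated per block size. */
#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_skip_8x8_sketch(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 8; r += 2) { /* skip odd rows */
    for (c = 0; c < 8; ++c) {
      sad += (unsigned int)abs(src[r * src_stride + c] -
                               ref[r * ref_stride + c]);
    }
  }
  return 2 * sad; /* rescale to full-height magnitude */
}

The matching _x4d skip variants below apply the same trick to four candidate references per call, following the established x4d convention.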
+ +void vpx_highbd_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x16x4d vpx_highbd_sad_skip_16x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x32 vpx_highbd_sad_skip_16x32_sse2 + +void vpx_highbd_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x32x4d vpx_highbd_sad_skip_16x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_16x8 vpx_highbd_sad_skip_16x8_sse2 + +void vpx_highbd_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_16x8x4d vpx_highbd_sad_skip_16x8x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x16 vpx_highbd_sad_skip_32x16_sse2 + +void vpx_highbd_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x16x4d vpx_highbd_sad_skip_32x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x32 vpx_highbd_sad_skip_32x32_sse2 + +void vpx_highbd_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x32x4d vpx_highbd_sad_skip_32x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_32x64 vpx_highbd_sad_skip_32x64_sse2 + +void
vpx_highbd_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_32x64x4d vpx_highbd_sad_skip_32x64x4d_sse2 + +unsigned int vpx_highbd_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x4 vpx_highbd_sad_skip_4x4_c + +void vpx_highbd_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x4x4d vpx_highbd_sad_skip_4x4x4d_c + +unsigned int vpx_highbd_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_4x8 vpx_highbd_sad_skip_4x8_c + +void vpx_highbd_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_4x8x4d vpx_highbd_sad_skip_4x8x4d_sse2 + +unsigned int vpx_highbd_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x32 vpx_highbd_sad_skip_64x32_sse2 + +void vpx_highbd_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x32x4d vpx_highbd_sad_skip_64x32x4d_sse2 + +unsigned int vpx_highbd_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_64x64 vpx_highbd_sad_skip_64x64_sse2 + +void vpx_highbd_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_64x64x4d vpx_highbd_sad_skip_64x64x4d_sse2 + +unsigned int vpx_highbd_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x16 vpx_highbd_sad_skip_8x16_sse2 + +void vpx_highbd_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x16x4d vpx_highbd_sad_skip_8x16x4d_sse2 + +unsigned int vpx_highbd_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define 
vpx_highbd_sad_skip_8x4 vpx_highbd_sad_skip_8x4_c + +void vpx_highbd_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x4x4d vpx_highbd_sad_skip_8x4x4d_c + +unsigned int vpx_highbd_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_highbd_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_highbd_sad_skip_8x8 vpx_highbd_sad_skip_8x8_sse2 + +void vpx_highbd_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_highbd_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_highbd_sad_skip_8x8x4d vpx_highbd_sad_skip_8x8x4d_sse2 + int vpx_highbd_satd_c(const tran_low_t *coeff, int length); #define vpx_highbd_satd vpx_highbd_satd_c +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height); +#define vpx_highbd_sse vpx_highbd_sse_c + void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd); #define vpx_highbd_subtract_block vpx_highbd_subtract_block_c @@ -1544,13 +1650,13 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *d void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2 -void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder 
*const scan_order); #define vpx_quantize_b vpx_quantize_b_ssse3 -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order); #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1561,8 +1667,8 @@ unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2 -void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1573,8 +1679,8 @@ unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2 -void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1585,8 +1691,8 @@ unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad16x8_avg_sse2(const uint8_t 
*src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2 -void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1597,8 +1703,8 @@ unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x16_avg vpx_sad32x16_avg_sse2 -void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1609,8 +1715,8 @@ unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2 -void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1621,8 +1727,8 @@ unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad32x64_avg vpx_sad32x64_avg_sse2 -void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], 
int ref_stride, uint32_t sad_array[4]); +void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1633,8 +1739,8 @@ unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2 -void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1645,8 +1751,8 @@ unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2 -void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1657,8 +1763,8 @@ unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad64x32_avg vpx_sad64x32_avg_sse2 -void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1669,8 +1775,8 @@ unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const ui unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, 
const uint8_t *second_pred); #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2 -void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1681,8 +1787,8 @@ unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uin unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2 -void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1693,8 +1799,8 @@ unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2 -void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); @@ -1705,10 +1811,110 @@ unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2 -void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); -void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2 +unsigned int vpx_sad_skip_16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x16 vpx_sad_skip_16x16_sse2 + +void vpx_sad_skip_16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x16x4d vpx_sad_skip_16x16x4d_sse2 + +unsigned int vpx_sad_skip_16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x32 vpx_sad_skip_16x32_sse2 + +void vpx_sad_skip_16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x32x4d vpx_sad_skip_16x32x4d_sse2 + +unsigned int vpx_sad_skip_16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_16x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_16x8 vpx_sad_skip_16x8_sse2 + +void vpx_sad_skip_16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_16x8x4d vpx_sad_skip_16x8x4d_sse2 + +unsigned int vpx_sad_skip_32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x16 vpx_sad_skip_32x16_sse2 + +void vpx_sad_skip_32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x16x4d vpx_sad_skip_32x16x4d_sse2 + +unsigned int vpx_sad_skip_32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_32x32 vpx_sad_skip_32x32_sse2 + +void vpx_sad_skip_32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x32x4d vpx_sad_skip_32x32x4d_sse2 + +unsigned int vpx_sad_skip_32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int 
ref_stride); +#define vpx_sad_skip_32x64 vpx_sad_skip_32x64_sse2 + +void vpx_sad_skip_32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_32x64x4d vpx_sad_skip_32x64x4d_sse2 + +unsigned int vpx_sad_skip_4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x4 vpx_sad_skip_4x4_c + +void vpx_sad_skip_4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x4x4d vpx_sad_skip_4x4x4d_c + +unsigned int vpx_sad_skip_4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_4x8 vpx_sad_skip_4x8_sse2 + +void vpx_sad_skip_4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_4x8x4d vpx_sad_skip_4x8x4d_sse2 + +unsigned int vpx_sad_skip_64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x32 vpx_sad_skip_64x32_sse2 + +void vpx_sad_skip_64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x32x4d vpx_sad_skip_64x32x4d_sse2 + +unsigned int vpx_sad_skip_64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_64x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_64x64 vpx_sad_skip_64x64_sse2 + +void vpx_sad_skip_64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_64x64x4d vpx_sad_skip_64x64x4d_sse2 + +unsigned int vpx_sad_skip_8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x16 vpx_sad_skip_8x16_sse2 + +void vpx_sad_skip_8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x16x4d vpx_sad_skip_8x16x4d_sse2 + +unsigned int vpx_sad_skip_8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x4 vpx_sad_skip_8x4_c + +void vpx_sad_skip_8x4x4d_c(const uint8_t *src_ptr, int src_stride, 
const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x4x4d vpx_sad_skip_8x4x4d_c + +unsigned int vpx_sad_skip_8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad_skip_8x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad_skip_8x8 vpx_sad_skip_8x8_sse2 + +void vpx_sad_skip_8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +void vpx_sad_skip_8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]); +#define vpx_sad_skip_8x8x4d vpx_sad_skip_8x8x4d_sse2 + int vpx_satd_c(const tran_low_t *coeff, int length); int vpx_satd_sse2(const tran_low_t *coeff, int length); #define vpx_satd vpx_satd_sse2 @@ -1732,6 +1938,9 @@ void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #define vpx_scaled_vert vpx_scaled_vert_c +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height); +#define vpx_sse vpx_sse_c + uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_sse2(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); uint32_t vpx_sub_pixel_avg_variance16x16_ssse3(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
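Also new in the hunk above is vpx_sse (its high-bitdepth twin vpx_highbd_sse was added earlier in this header): a standalone sum-of-squared-error kernel, SSE = sum over the block of (a - b)^2, returned as int64_t so large planes cannot overflow the accumulator. A reference sketch written to match the prototype, not copied from the tree:

/* Reference behaviour for a vpx_sse-style kernel: sum of squared pixel
 * differences over a width x height window. Sketch only. */
#include <stdint.h>

static int64_t sse_sketch(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  int64_t sse = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int diff = a[x] - b[x];
      sse += (int64_t)diff * diff; /* widen before accumulating */
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}

Exposing SSE directly avoids routing through the variance functions when only the raw error sum is needed.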
diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h index fa2cc50fd..00ab40fe2 100644 --- a/config/x86_64/vpx_version.h +++ b/config/x86_64/vpx_version.h @@ -1,8 +1,8 @@ // This file is generated. Do not edit. #define VERSION_MAJOR 1 -#define VERSION_MINOR 13 +#define VERSION_MINOR 14 #define VERSION_PATCH 0 -#define VERSION_EXTRA "1559-gcd7dbca207" +#define VERSION_EXTRA "1616-g26104bbc9d" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.13.0-1559-gcd7dbca207" -#define VERSION_STRING " v1.13.0-1559-gcd7dbca207" +#define VERSION_STRING_NOSP "v1.14.0-1616-g26104bbc9d" +#define VERSION_STRING " v1.14.0-1616-g26104bbc9d" @@ -102,11 +102,14 @@ all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-darwin20-gcc" all_platforms="${all_platforms} arm64-darwin21-gcc" all_platforms="${all_platforms} arm64-darwin22-gcc" +all_platforms="${all_platforms} arm64-darwin23-gcc" all_platforms="${all_platforms} arm64-linux-gcc" all_platforms="${all_platforms} arm64-win64-gcc" all_platforms="${all_platforms} arm64-win64-vs15" all_platforms="${all_platforms} arm64-win64-vs16" +all_platforms="${all_platforms} arm64-win64-vs16-clangcl" all_platforms="${all_platforms} arm64-win64-vs17" +all_platforms="${all_platforms} arm64-win64-vs17-clangcl" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 @@ -163,6 +166,7 @@ all_platforms="${all_platforms} x86_64-darwin19-gcc" all_platforms="${all_platforms} x86_64-darwin20-gcc" all_platforms="${all_platforms} x86_64-darwin21-gcc" all_platforms="${all_platforms} x86_64-darwin22-gcc" +all_platforms="${all_platforms} x86_64-darwin23-gcc" all_platforms="${all_platforms} x86_64-iphonesimulator-gcc" all_platforms="${all_platforms} x86_64-linux-gcc" all_platforms="${all_platforms} x86_64-linux-icc" @@ -243,12 +247,21 @@ CODEC_FAMILIES=" ARCH_LIST=" arm + aarch64 mips x86 x86_64 ppc loongarch " + +ARCH_EXT_LIST_AARCH64=" + neon + neon_dotprod + neon_i8mm + sve +" + ARCH_EXT_LIST_X86=" mmx sse @@ -268,8 +281,8 @@ ARCH_EXT_LIST_LOONGSON=" " ARCH_EXT_LIST=" - neon neon_asm + ${ARCH_EXT_LIST_AARCH64} mips32 dspr2 @@ -293,6 +306,7 @@ EXPERIMENT_LIST=" emulate_hardware non_greedy_mv rate_ctrl + collect_component_timing " CONFIG_LIST=" dependency_tracking @@ -342,7 +356,6 @@ CONFIG_LIST=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking vp9_highbitdepth better_hw_compatibility @@ -406,7 +419,6 @@ CMDLINE_SELECT=" multi_res_encoding temporal_denoising vp9_temporal_denoising - consistent_recode coefficient_range_checking better_hw_compatibility vp9_highbitdepth @@ -633,7 +645,6 @@ process_toolchain() { if enabled gcc; then enabled werror && check_add_cflags -Werror check_add_cflags -Wall - check_add_cflags -Wdeclaration-after-statement check_add_cflags -Wdisabled-optimization check_add_cflags -Wextra-semi check_add_cflags -Wextra-semi-stmt @@ -647,8 +658,9 @@ process_toolchain() { check_add_cflags -Wimplicit-function-declaration check_add_cflags -Wmissing-declarations check_add_cflags -Wmissing-prototypes + check_add_cflags -Wshadow check_add_cflags -Wuninitialized - check_add_cflags -Wunreachable-code-loop-increment + check_add_cflags -Wunreachable-code-aggressive check_add_cflags -Wunused check_add_cflags -Wextra # check_add_cflags also adds to cxxflags. gtest does not do well with @@ -659,9 +671,8 @@ process_toolchain() { if enabled mips || [ -z "${INLINE}" ]; then enabled extra_warnings || check_add_cflags -Wno-unused-function fi - # Enforce c89 for c files. Don't be too strict about it though. Allow - # gnu extensions like "//" for comments. - check_cflags -std=gnu89 && add_cflags_only -std=gnu89 + # Enforce C99 for C files. Allow GNU extensions. + check_cflags -std=gnu99 && add_cflags_only -std=gnu99
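The last change in that configure hunk is one of the most consequential in the whole update: C sources are now built as gnu99 rather than gnu89, which is also why -Wdeclaration-after-statement was dropped from the warning list just above. Declarations may now follow statements and loop counters can be scoped to their loop, the pattern the SVC example code below adopts with its const int i. A small self-contained illustration of code the old gnu89 setup rejected:

/* Legal under -std=gnu99, rejected under the previous -std=gnu89 rules. */
#include <stdio.h>

int main(void) {
  int total = 0;
  puts("C99 scoping");           /* a statement first... */
  int step = 2;                  /* ...then a declaration (C99 only) */
  for (int i = 0; i < 5; ++i) {  /* loop-scoped counter (C99 only) */
    total += i * step;
  }
  printf("total = %d\n", total); /* prints total = 20 */
  return 0;
}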
Allow - # gnu extensions like "//" for comments. - check_cflags -std=gnu89 && add_cflags_only -std=gnu89 + # Enforce C99 for C files. Allow GNU extensions. + check_cflags -std=gnu99 && add_cflags_only -std=gnu99 # Avoid this warning for third_party C++ sources. Some reorganization # would be needed to apply this only to test/*.cc. check_cflags -Wshorten-64-to-32 && add_cflags_only -Wshorten-64-to-32 @@ -676,14 +687,18 @@ process_toolchain() { check_add_cxxflags -Wc++14-extensions check_add_cxxflags -Wc++17-extensions check_add_cxxflags -Wc++20-extensions + check_add_cxxflags -Wnon-virtual-dtor - # disable some warnings specific to libyuv. + # disable some warnings specific to libyuv / libwebm. check_cxxflags -Wno-missing-declarations \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations" check_cxxflags -Wno-missing-prototypes \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes" check_cxxflags -Wno-pass-failed \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed" + check_cxxflags -Wno-shadow \ + && LIBWEBM_CXXFLAGS="${LIBWEBM_CXXFLAGS} -Wno-shadow" \ + && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-shadow" check_cxxflags -Wno-unused-parameter \ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter" fi diff --git a/examples.mk b/examples.mk index 42886f1e1..22726a3d4 100644 --- a/examples.mk +++ b/examples.mk @@ -57,6 +57,7 @@ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \ # Add compile flags and include path for libwebm sources. ifeq ($(CONFIG_WEBM_IO),yes) CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS + $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm endif @@ -81,8 +82,6 @@ ifeq ($(CONFIG_LIBYUV),yes) $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS} endif ifeq ($(CONFIG_WEBM_IO),yes) - vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS) - vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS) vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS) vpxdec.SRCS += webmdec.cc webmdec.h endif diff --git a/examples/resize_util.c b/examples/resize_util.c index 7e529b2e2..5fb63e166 100644 --- a/examples/resize_util.c +++ b/examples/resize_util.c @@ -52,6 +52,7 @@ int main(int argc, char *argv[]) { uint8_t *inbuf_v, *outbuf_v; int f, frames; int width, height, target_width, target_height; + int failed = 0; exec_name = argv[0]; @@ -82,6 +83,7 @@ int main(int argc, char *argv[]) { } fpout = fopen(fout, "wb"); if (fpout == NULL) { + fclose(fpin); printf("Can't open file %s to write\n", fout); usage(); return 1; @@ -100,6 +102,11 @@ int main(int argc, char *argv[]) { inbuf = (uint8_t *)malloc(width * height * 3 / 2); outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2); + if (!(inbuf && outbuf)) { + printf("Failed to allocate buffers.\n"); + failed = 1; + goto Error; + } inbuf_u = inbuf + width * height; inbuf_v = inbuf_u + width * height / 4; outbuf_u = outbuf + target_width * target_height; @@ -114,10 +121,11 @@ int main(int argc, char *argv[]) { f++; } printf("%d frames processed\n", f); +Error: fclose(fpin); fclose(fpout); free(inbuf); free(outbuf); - return 0; + return failed; } diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c index 003096e70..1dd731765 100644 --- a/examples/svc_encodeframe.c +++ b/examples/svc_encodeframe.c @@ -279,7 +279,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { if (svc_ctx == NULL || options == NULL || si == NULL) { return VPX_CODEC_INVALID_PARAM; } - strncpy(si->options, options, 
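The VERSION_PACKED macro in the version hunk above packs major/minor/patch into one integer, with eight bits each for minor and patch. As a quick sanity check on the bump to v1.14.0 -- a stand-alone sketch, not part of the patch:

#include <stdio.h>

#define VERSION_MAJOR 1
#define VERSION_MINOR 14
#define VERSION_PATCH 0
#define VERSION_PACKED \
  ((VERSION_MAJOR << 16) | (VERSION_MINOR << 8) | (VERSION_PATCH))

int main(void) {
  /* 1 << 16 = 0x10000 and 14 << 8 = 0xE00, so v1.14.0 packs to 0x10E00. */
  printf("VERSION_PACKED = 0x%X\n", VERSION_PACKED); /* prints 0x10E00 */
  return 0;
}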
diff --git a/examples.mk b/examples.mk
index 42886f1e1..22726a3d4 100644
--- a/examples.mk
+++ b/examples.mk
@@ -57,6 +57,7 @@ LIBWEBM_PARSER_SRCS = third_party/libwebm/mkvparser/mkvparser.cc \
 # Add compile flags and include path for libwebm sources.
 ifeq ($(CONFIG_WEBM_IO),yes)
   CXXFLAGS += -D__STDC_CONSTANT_MACROS -D__STDC_LIMIT_MACROS
+  $(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS)
   INC_PATH-yes += $(SRC_PATH_BARE)/third_party/libwebm
 endif

@@ -81,8 +82,6 @@ ifeq ($(CONFIG_LIBYUV),yes)
   $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS}
 endif
 ifeq ($(CONFIG_WEBM_IO),yes)
-  vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS)
-  vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS)
   vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
   vpxdec.SRCS += webmdec.cc webmdec.h
 endif
diff --git a/examples/resize_util.c b/examples/resize_util.c
index 7e529b2e2..5fb63e166 100644
--- a/examples/resize_util.c
+++ b/examples/resize_util.c
@@ -52,6 +52,7 @@ int main(int argc, char *argv[]) {
   uint8_t *inbuf_v, *outbuf_v;
   int f, frames;
   int width, height, target_width, target_height;
+  int failed = 0;

   exec_name = argv[0];

@@ -82,6 +83,7 @@ int main(int argc, char *argv[]) {
   }
   fpout = fopen(fout, "wb");
   if (fpout == NULL) {
+    fclose(fpin);
     printf("Can't open file %s to write\n", fout);
     usage();
     return 1;
@@ -100,6 +102,11 @@ int main(int argc, char *argv[]) {

   inbuf = (uint8_t *)malloc(width * height * 3 / 2);
   outbuf = (uint8_t *)malloc(target_width * target_height * 3 / 2);
+  if (!(inbuf && outbuf)) {
+    printf("Failed to allocate buffers.\n");
+    failed = 1;
+    goto Error;
+  }
   inbuf_u = inbuf + width * height;
   inbuf_v = inbuf_u + width * height / 4;
   outbuf_u = outbuf + target_width * target_height;
@@ -114,10 +121,11 @@ int main(int argc, char *argv[]) {
     f++;
   }
   printf("%d frames processed\n", f);
+Error:
   fclose(fpin);
   fclose(fpout);
   free(inbuf);
   free(outbuf);
-  return 0;
+  return failed;
 }
diff --git a/examples/svc_encodeframe.c b/examples/svc_encodeframe.c
index 003096e70..1dd731765 100644
--- a/examples/svc_encodeframe.c
+++ b/examples/svc_encodeframe.c
@@ -279,7 +279,7 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) {
   if (svc_ctx == NULL || options == NULL || si == NULL) {
    return VPX_CODEC_INVALID_PARAM;
   }
-  strncpy(si->options, options, sizeof(si->options));
+  strncpy(si->options, options, sizeof(si->options) - 1);
   si->options[sizeof(si->options) - 1] = '\0';
   return VPX_CODEC_OK;
 }
@@ -381,7 +381,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
                              vpx_codec_iface_t *iface,
                              vpx_codec_enc_cfg_t *enc_cfg) {
   vpx_codec_err_t res;
-  int i, sl, tl;
+  int sl, tl;
   SvcInternal_t *const si = get_svc_internal(svc_ctx);
   if (svc_ctx == NULL || codec_ctx == NULL || iface == NULL ||
       enc_cfg == NULL) {
@@ -433,7 +433,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
   }
   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       si->svc_params.max_quantizers[i] = MAX_QUANTIZER;
       si->svc_params.min_quantizers[i] = 0;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
@@ -503,7 +503,7 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,

   for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) {
     for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) {
-      i = sl * svc_ctx->temporal_layers + tl;
+      const int i = sl * svc_ctx->temporal_layers + tl;
       if (enc_cfg->rc_end_usage == VPX_CBR &&
           enc_cfg->g_pass == VPX_RC_ONE_PASS) {
         si->svc_params.max_quantizers[i] = enc_cfg->rc_max_quantizer;
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c
index d287e5831..998e4fb20 100644
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -32,6 +32,7 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "./y4minput.h"

+#define OUTPUT_FRAME_STATS 0
 #define OUTPUT_RC_STATS 1

 #define SIMULCAST_MODE 0
@@ -315,7 +316,6 @@ static void parse_command_line(int argc, const char **argv_,
         break;
       default:
         die("Error: Invalid bit depth selected (%d)\n", enc_cfg->g_bit_depth);
-        break;
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   } else if (arg_match(&arg, &dropframe_thresh_arg, argi)) {
@@ -880,7 +880,9 @@ int main(int argc, const char **argv) {
   int pts = 0;            /* PTS starts at 0 */
   int frame_duration = 1; /* 1 timebase tick per frame */
   int end_of_stream = 0;
+#if OUTPUT_FRAME_STATS
   int frames_received = 0;
+#endif
 #if OUTPUT_RC_STATS
   VpxVideoWriter *outfile[VPX_SS_MAX_LAYERS] = { NULL };
   struct RateControlStats rc;
@@ -1126,14 +1128,14 @@ int main(int argc, const char **argv) {
             }
 #endif
           }
-          /*
+#if OUTPUT_FRAME_STATS
           printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
                  !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
                  (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
-          */
+          ++frames_received;
+#endif
           if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
             si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
-          ++frames_received;
 #if CONFIG_VP9_DECODER && !SIMULCAST_MODE
           if (vpx_codec_decode(&decoder, cx_pkt->data.frame.buf,
                                (unsigned int)cx_pkt->data.frame.sz, NULL, 0))
diff --git a/generate_config.sh b/generate_config.sh
index 79700d535..a8a43e149 100755
--- a/generate_config.sh
+++ b/generate_config.sh
@@ -210,7 +210,8 @@ intel="--disable-sse4_1 --disable-avx --disable-avx2 --disable-avx512 --as=yasm"
 gen_config_files x86 "--target=x86-linux-gcc ${intel} ${all_platforms}"
 gen_config_files x86_64 "--target=x86_64-linux-gcc ${intel} ${all_platforms}"
 gen_config_files arm-neon "--target=armv7-linux-gcc ${all_platforms}"
-gen_config_files arm64 "--force-target=armv8-linux-gcc ${all_platforms}"
+arm64="--disable-neon_dotprod --disable-neon_i8mm"
+gen_config_files arm64 "--target=armv8-linux-gcc ${arm64} ${all_platforms}"
 gen_config_files generic "--target=generic-gnu ${all_platforms}"

 echo "Remove temporary directory."
@@ -233,7 +234,7 @@ cd $TEMP_DIR
 gen_rtcd_header x86 x86 "${intel}"
 gen_rtcd_header x86_64 x86_64 "${intel}"
 gen_rtcd_header arm-neon armv7
-gen_rtcd_header arm64 armv8
+gen_rtcd_header arm64 armv8 "${arm64}"
 gen_rtcd_header generic generic

 echo "Prepare Makefile."
@@ -178,6 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h
 INSTALL-LIBS-yes += include/vpx/vpx_integer.h
 INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
 INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
+INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h
 ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
 ifeq ($(CONFIG_MSVS),yes)
 INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
@@ -312,9 +313,9 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
 # To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
 # SO_VERSION_* then follow the rules in the link to detemine the new version
 # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 8
+SO_VERSION_MAJOR := 9
 SO_VERSION_MINOR := 0
-SO_VERSION_PATCH := 1
+SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
 SHARED_LIB_SUF := .dylib
@@ -545,7 +546,7 @@ testdata: $(LIBVPX_TEST_DATA)
 	    echo "Checking test data:";\
 	    for f in $(call enabled,LIBVPX_TEST_DATA); do\
 	      grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
-	        (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
+	        (cd "$(LIBVPX_TEST_DATA_PATH)"; $${sha1sum} -c);\
 	    done; \
         else\
 	    echo "Skipping test data integrity check, sha1sum not found.";\
@@ -631,8 +632,8 @@ test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
             -I. -I"$(SRC_PATH_BARE)/third_party/googletest/src/include" \
             -L. -l$(CODEC_LIB) -l$(RC_RTC_LIB) -l$(GTEST_LIB) $^
 endif  # RC_INTERFACE_TEST
-endif  # CONFIG_VP9_ENCODER
-endif
+endif  # CONFIG_ENCODERS
+endif  # CONFIG_MSVS
 else

 include $(SRC_PATH_BARE)/third_party/googletest/gtest.mk
@@ -699,7 +700,7 @@ $(eval $(call linkerxx_template,$(SIMPLE_ENCODE_TEST_BIN), \
           -L. -lsimple_encode -lvpx -lgtest $(extralibs) -lm))
 endif  # SIMPLE_ENCODE_TEST

-endif  # CONFIG_UNIT_TESTS
+endif  # CONFIG_EXTERNAL_BUILD

 # Install test sources only if codec source is included
 INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(patsubst $(SRC_PATH_BARE)/%,%,\
@@ -724,7 +725,7 @@ NUM_SHARDS := 10
 SHARDS := 0 1 2 3 4 5 6 7 8 9
 $(foreach s,$(SHARDS),$(eval $(call test_shard_template,$(s),$(NUM_SHARDS))))

-endif
+endif  # CONFIG_UNIT_TESTS

 ##
 ## documentation directives
@@ -764,10 +765,10 @@ TEST_BIN_PATH := $(addsuffix /$(TGT_OS:win64=x64)/Release, $(TEST_BIN_PATH))
 endif
 utiltest utiltest-no-data-check:
 	$(qexec)$(SRC_PATH_BARE)/test/vpxdec.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(TEST_BIN_PATH)
 	$(qexec)$(SRC_PATH_BARE)/test/vpxenc.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(TEST_BIN_PATH)
 utiltest: testdata
 else
@@ -791,7 +792,7 @@ EXAMPLES_BIN_PATH := $(TGT_OS:win64=x64)/Release
 endif
 exampletest exampletest-no-data-check: examples
 	$(qexec)$(SRC_PATH_BARE)/test/examples.sh \
-		--test-data-path $(LIBVPX_TEST_DATA_PATH) \
+		--test-data-path "$(LIBVPX_TEST_DATA_PATH)" \
 		--bin-path $(EXAMPLES_BIN_PATH)
 exampletest: testdata
 else
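The SO_VERSION bump above follows the libtool current:age:revision bookkeeping described in the comment. Reading the numbers off the diff, with the c/a/r reasoning being an assumption based on libtool's usual rules rather than anything stated in the patch:

#include <stdio.h>

int main(void) {
  /* Old values MAJOR 8, MINOR 0, PATCH 1 imply c = 8, a = 0, r = 1.
   * Interfaces changed this release (vpx_tpl.h was added, among others),
   * so -- assuming the standard libtool rules -- current is bumped and
   * age/revision reset: c = 9, a = 0, r = 0. */
  const int current = 9, age = 0, revision = 0;
  printf("SO_VERSION = %d.%d.%d\n", current - age, age, revision); /* 9.0.0 */
  return 0;
}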
diff --git a/test/acm_random.h b/test/acm_random.h
index c7122b933..e3520c47d 100644
--- a/test/acm_random.h
+++ b/test/acm_random.h
@@ -45,16 +45,11 @@ class ACMRandom {
     return static_cast<int16_t>(random_.Generate(65536));
   }

-  int16_t Rand13Signed() {
-    // Use 13 bits: values between 4095 and -4096.
-    const uint32_t value = random_.Generate(8192);
-    return static_cast<int16_t>(value) - 4096;
-  }
-
-  int16_t Rand9Signed() {
-    // Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
-    const uint32_t value = random_.Generate(512);
-    return static_cast<int16_t>(value) - 256;
+  uint16_t Rand12() {
+    const uint32_t value =
+        random_.Generate(testing::internal::Random::kMaxRange);
+    // There's a bit more entropy in the upper bits of this implementation.
+    return (value >> 19) & 0xfff;
   }

   uint8_t Rand8() {
diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc
index 68d8856ea..ad067346a 100644
--- a/test/active_map_refresh_test.cc
+++ b/test/active_map_refresh_test.cc
@@ -62,16 +62,16 @@ class ActiveMapRefreshTest
     public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   ActiveMapRefreshTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapRefreshTest() {}
+  ~ActiveMapRefreshTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     ::libvpx_test::Y4mVideoSource *y4m_video =
         static_cast<libvpx_test::Y4mVideoSource *>(video);
     if (video->frame() == 0) {
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index 543ec0d35..d222c00b7 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -26,16 +26,16 @@ class ActiveMapTest
   static const int kHeight = 144;

   ActiveMapTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~ActiveMapTest() {}
+  ~ActiveMapTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     cpu_used_ = GET_PARAM(2);
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3));
diff --git a/test/add_noise_test.cc b/test/add_noise_test.cc
index 7dc86e3eb..4fc4e81e6 100644
--- a/test/add_noise_test.cc
+++ b/test/add_noise_test.cc
@@ -32,8 +32,8 @@ typedef std::tuple<double, AddNoiseFunc> AddNoiseTestFPParam;
 class AddNoiseTest : public ::testing::Test,
                      public ::testing::WithParamInterface<AddNoiseTestFPParam> {
  public:
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
-  virtual ~AddNoiseTest() {}
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+  ~AddNoiseTest() override = default;
 };

 double stddev6(char a, char b, char c, char d, char e, char f) {
diff --git a/test/alt_ref_aq_segment_test.cc b/test/alt_ref_aq_segment_test.cc
index 00a00e27c..3b1a26ed1 100644
--- a/test/alt_ref_aq_segment_test.cc
+++ b/test/alt_ref_aq_segment_test.cc
@@ -20,9 +20,9 @@ class AltRefAqSegmentTest
     public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AltRefAqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AltRefAqSegmentTest() {}
+  ~AltRefAqSegmentTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
@@ -30,8 +30,8 @@ class AltRefAqSegmentTest
     alt_ref_aq_mode_ = 0;
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_ALT_REF_AQ, alt_ref_aq_mode_);
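The new Rand12 above draws a full-range value and keeps the top twelve bits rather than the bottom ones; as its comment notes, the upper bits of this generator carry more entropy (in linear congruential generators the low-order bits cycle with short periods). A stand-alone C sketch of the same extraction -- the 31-bit range mirrors gtest's Random::kMaxRange, and the LCG here is a stand-in, not gtest's implementation:

#include <stdint.h>
#include <stdio.h>

/* Stand-in LCG producing 31-bit values. */
static uint32_t lcg_state = 1;
static uint32_t lcg_next(void) {
  lcg_state = lcg_state * 1103515245u + 12345u;
  return lcg_state & 0x7fffffff; /* 31 usable bits: 0 .. 2^31 - 1 */
}

int main(void) {
  /* Keep bits 19..30 -- the twelve highest of the 31 -- instead of the
   * low bits, which repeat with a short period in an LCG. */
  uint16_t r12 = (uint16_t)((lcg_next() >> 19) & 0xfff);
  printf("%u\n", r12); /* always in [0, 4095] */
  return 0;
}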
diff --git a/test/altref_test.cc b/test/altref_test.cc
index 69bcef774..903230fde 100644
--- a/test/altref_test.cc
+++ b/test/altref_test.cc
@@ -24,24 +24,24 @@ class AltRefTest : public ::libvpx_test::EncoderTest,
                    public ::libvpx_test::CodecTestWithParam<int> {
  protected:
   AltRefTest() : EncoderTest(GET_PARAM(0)), altref_count_(0) {}
-  virtual ~AltRefTest() {}
+  ~AltRefTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }

-  virtual void BeginPassHook(unsigned int /*pass*/) { altref_count_ = 0; }
+  void BeginPassHook(unsigned int /*pass*/) override { altref_count_ = 0; }

-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
       encoder->Control(VP8E_SET_CPUUSED, 3);
     }
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_INVISIBLE) ++altref_count_;
   }

@@ -75,17 +75,17 @@ class AltRefForcedKeyTestLarge
   AltRefForcedKeyTestLarge()
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         cpu_used_(GET_PARAM(2)), forced_kf_frame_num_(1), frame_num_(0) {}
-  virtual ~AltRefForcedKeyTestLarge() {}
+  ~AltRefForcedKeyTestLarge() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     cfg_.rc_end_usage = VPX_VBR;
     cfg_.g_threads = 0;
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -100,7 +100,7 @@ class AltRefForcedKeyTestLarge
         (video->frame() == forced_kf_frame_num_) ? VPX_EFLAG_FORCE_KF : 0;
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (frame_num_ == forced_kf_frame_num_) {
       ASSERT_TRUE(!!(pkt->data.frame.flags & VPX_FRAME_IS_KEY))
           << "Frame #" << frame_num_ << " isn't a keyframe!";
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 2cbc991d0..955e1dafc 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -20,17 +20,17 @@ class AqSegmentTest
     public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
  protected:
   AqSegmentTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~AqSegmentTest() {}
+  ~AqSegmentTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
     set_cpu_used_ = GET_PARAM(2);
     aq_mode_ = 0;
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_AQ_MODE, aq_mode_);
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 196522ce5..ede9c0ba8 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -38,7 +38,7 @@ class AverageTestBase : public ::testing::Test {
       : width_(width), height_(height), source_data_(nullptr),
         source_stride_(0), bit_depth_(8) {}

-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data_);
     source_data_ = nullptr;
     libvpx_test::ClearSystemState();
@@ -49,7 +49,7 @@ class AverageTestBase : public ::testing::Test {
   static const int kDataAlignment = 16;
   static const int kDataBlockSize = 64 * 128;

-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = reinterpret_cast<Pixel *>(
         vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_NE(source_data_, nullptr);
@@ -169,7 +169,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   }

  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     source_data_ = reinterpret_cast<uint8_t *>(
         vpx_memalign(kDataAlignment, kDataBlockSize * sizeof(source_data_[0])));
     ASSERT_NE(source_data_, nullptr);
@@ -180,7 +180,7 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
         vpx_memalign(kDataAlignment, sizeof(*hbuf_c_) * 16));
   }

-  virtual void TearDown() {
+  void TearDown() override {
     vpx_free(source_data_);
     source_data_ = nullptr;
     vpx_free(hbuf_c_);
@@ -190,8 +190,9 @@ class IntProRowTest : public AverageTestBase<uint8_t>,
   }

   void RunComparison() {
-    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
-    ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+    ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_));
+    ASM_REGISTER_STATE_CHECK(
+        asm_func_(hbuf_asm_, source_data_, width_, height_));
     EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
         << "Output mismatch";
   }
@@ -238,7 +239,7 @@ typedef std::tuple<int, SatdFunc> SatdTestParam;
 class SatdTest : public ::testing::Test,
                  public ::testing::WithParamInterface<SatdTestParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     satd_size_ = GET_PARAM(0);
     satd_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -247,7 +248,7 @@ class SatdTest : public ::testing::Test,
     ASSERT_NE(src_, nullptr);
   }

-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(src_);
   }
@@ -276,7 +277,7 @@ class SatdTest : public ::testing::Test,

 class SatdLowbdTest : public SatdTest {
  protected:
-  virtual void FillRandom() {
+  void FillRandom() override {
     for (int i = 0; i < satd_size_; ++i) {
       const int16_t tmp = rnd_.Rand16Signed();
       src_[i] = (tran_low_t)tmp;
@@ -292,7 +293,7 @@ class BlockErrorTestFP
     : public ::testing::Test,
       public ::testing::WithParamInterface<BlockErrorTestFPParam> {
  protected:
-  virtual void SetUp() {
+  void SetUp() override {
     txfm_size_ = GET_PARAM(0);
     block_error_func_ = GET_PARAM(1);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -304,7 +305,7 @@ class BlockErrorTestFP
     ASSERT_NE(dqcoeff_, nullptr);
   }

-  virtual void TearDown() {
+  void TearDown() override {
     libvpx_test::ClearSystemState();
     vpx_free(coeff_);
     vpx_free(dqcoeff_);
@@ -463,7 +464,7 @@ TEST_P(SatdLowbdTest, DISABLED_Speed) {
 #if CONFIG_VP9_HIGHBITDEPTH
 class SatdHighbdTest : public SatdTest {
  protected:
-  virtual void FillRandom() {
+  void FillRandom() override {
     for (int i = 0; i < satd_size_; ++i) {
       src_[i] = rnd_.Rand20Signed();
     }
@@ -582,6 +583,13 @@ INSTANTIATE_TEST_SUITE_P(
                       make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_sse2)));
 #endif  // HAVE_SSE2

+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AverageTestHBD,
+    ::testing::Values(make_tuple(16, 16, 1, 8, &vpx_highbd_avg_8x8_neon),
+                      make_tuple(16, 16, 1, 4, &vpx_highbd_avg_4x4_neon)));
+#endif  // HAVE_NEON
+
 INSTANTIATE_TEST_SUITE_P(C, SatdHighbdTest,
                          ::testing::Values(make_tuple(16, &vpx_satd_c),
                                            make_tuple(64, &vpx_satd_c),
@@ -694,16 +702,21 @@ INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest,
                                            make_tuple(256, &vpx_satd_neon),
                                            make_tuple(1024, &vpx_satd_neon)));

-// TODO(jianj): Remove the highbitdepth flag once the SIMD functions are
-// in place.
-#if !CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    NEON, SatdHighbdTest,
+    ::testing::Values(make_tuple(16, &vpx_highbd_satd_neon),
+                      make_tuple(64, &vpx_highbd_satd_neon),
+                      make_tuple(256, &vpx_highbd_satd_neon),
+                      make_tuple(1024, &vpx_highbd_satd_neon)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 INSTANTIATE_TEST_SUITE_P(
     NEON, BlockErrorTestFP,
     ::testing::Values(make_tuple(16, &vp9_block_error_fp_neon),
                       make_tuple(64, &vp9_block_error_fp_neon),
                       make_tuple(256, &vp9_block_error_fp_neon),
                       make_tuple(1024, &vp9_block_error_fp_neon)));
-#endif  // !CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_NEON

 #if HAVE_MSA
diff --git a/test/bench.h b/test/bench.h
index 57ca9118b..203e4d247 100644
--- a/test/bench.h
+++ b/test/bench.h
@@ -16,6 +16,8 @@

 class AbstractBench {
  public:
+  virtual ~AbstractBench() = default;
+
   void RunNTimes(int n);
   void PrintMedian(const char *title);

diff --git a/test/blockiness_test.cc b/test/blockiness_test.cc
index 11b2a3f61..5a45bc0b7 100644
--- a/test/blockiness_test.cc
+++ b/test/blockiness_test.cc
@@ -49,14 +49,14 @@ class BlockinessTestBase : public ::testing::Test {
     reference_data_ = nullptr;
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;

-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
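The IntProRowTest change above stops passing 0 as the reference stride and passes the real width instead. With a zero stride every iteration of the kernel re-reads row 0, so a bug in the row stepping could never be caught. A simplified scalar model of why the stride argument matters -- this is an illustration, not the actual vpx_int_pro_row kernel, whose normalization differs:

#include <stdint.h>

static void int_pro_row_model(int16_t hbuf[16], const uint8_t *ref,
                              int ref_stride, int height) {
  for (int col = 0; col < 16; ++col) {
    int sum = 0;
    /* Rows are ref_stride bytes apart; with ref_stride == 0 this loop
     * would sum the same row `height` times. */
    for (int row = 0; row < height; ++row) {
      sum += ref[row * ref_stride + col];
    }
    hbuf[col] = (int16_t)sum; /* the real kernel also normalizes */
  }
}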
diff --git a/test/borders_test.cc b/test/borders_test.cc
index 3c1f69a92..2726bd557 100644
--- a/test/borders_test.cc
+++ b/test/borders_test.cc
@@ -22,15 +22,15 @@ class BordersTest
     public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> {
  protected:
   BordersTest() : EncoderTest(GET_PARAM(0)) {}
-  virtual ~BordersTest() {}
+  ~BordersTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, 1);
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -40,7 +40,7 @@ class BordersTest
     }
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) {
     }
   }
diff --git a/test/byte_alignment_test.cc b/test/byte_alignment_test.cc
index 1e0ffceb8..ba6fffc52 100644
--- a/test/byte_alignment_test.cc
+++ b/test/byte_alignment_test.cc
@@ -58,7 +58,7 @@ class ByteAlignmentTest
   ByteAlignmentTest()
       : video_(nullptr), decoder_(nullptr), md5_file_(nullptr) {}

-  virtual void SetUp() {
+  void SetUp() override {
     video_ = new libvpx_test::WebMVideoSource(kVP9TestFile);
     ASSERT_NE(video_, nullptr);
     video_->Init();
@@ -71,7 +71,7 @@ class ByteAlignmentTest
     OpenMd5File(kVP9Md5File);
   }

-  virtual void TearDown() {
+  void TearDown() override {
     if (md5_file_ != nullptr) fclose(md5_file_);

     delete decoder_;
diff --git a/test/codec_factory.h b/test/codec_factory.h
index 96092610c..d00563df1 100644
--- a/test/codec_factory.h
+++ b/test/codec_factory.h
@@ -84,7 +84,7 @@ class VP8Decoder : public Decoder {
       : Decoder(cfg, flag) {}

  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP8_DECODER
     return &vpx_codec_vp8_dx_algo;
 #else
@@ -100,7 +100,7 @@ class VP8Encoder : public Encoder {
       : Encoder(cfg, deadline, init_flags, stats) {}

  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP8_ENCODER
     return &vpx_codec_vp8_cx_algo;
 #else
@@ -113,12 +113,12 @@ class VP8CodecFactory : public CodecFactory {
  public:
   VP8CodecFactory() : CodecFactory() {}

-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }

-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                         const vpx_codec_flags_t flags) const override {
 #if CONFIG_VP8_DECODER
     return new VP8Decoder(cfg, flags);
 #else
@@ -128,10 +128,9 @@ class VP8CodecFactory : public CodecFactory {
 #endif
   }

-  virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+                         const unsigned long init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_VP8_ENCODER
     return new VP8Encoder(cfg, deadline, init_flags, stats);
 #else
@@ -143,8 +142,8 @@ class VP8CodecFactory : public CodecFactory {
 #endif
   }

-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+  vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                       int usage) const override {
 #if CONFIG_VP8_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp8_cx_algo, cfg, usage);
 #else
@@ -180,7 +179,7 @@ class VP9Decoder : public Decoder {
       : Decoder(cfg, flag) {}

  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_DECODER
     return &vpx_codec_vp9_dx_algo;
 #else
@@ -196,7 +195,7 @@ class VP9Encoder : public Encoder {
       : Encoder(cfg, deadline, init_flags, stats) {}

  protected:
-  virtual vpx_codec_iface_t *CodecInterface() const {
+  vpx_codec_iface_t *CodecInterface() const override {
 #if CONFIG_VP9_ENCODER
     return &vpx_codec_vp9_cx_algo;
 #else
@@ -209,12 +208,12 @@ class VP9CodecFactory : public CodecFactory {
  public:
   VP9CodecFactory() : CodecFactory() {}

-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg) const override {
     return CreateDecoder(cfg, 0);
   }

-  virtual Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
-                                 const vpx_codec_flags_t flags) const {
+  Decoder *CreateDecoder(vpx_codec_dec_cfg_t cfg,
+                         const vpx_codec_flags_t flags) const override {
 #if CONFIG_VP9_DECODER
     return new VP9Decoder(cfg, flags);
 #else
@@ -224,10 +223,9 @@ class VP9CodecFactory : public CodecFactory {
 #endif
   }

-  virtual Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg,
-                                 unsigned long deadline,
-                                 const unsigned long init_flags,
-                                 TwopassStatsStore *stats) const {
+  Encoder *CreateEncoder(vpx_codec_enc_cfg_t cfg, unsigned long deadline,
+                         const unsigned long init_flags,
+                         TwopassStatsStore *stats) const override {
 #if CONFIG_VP9_ENCODER
     return new VP9Encoder(cfg, deadline, init_flags, stats);
 #else
@@ -239,8 +237,8 @@ class VP9CodecFactory : public CodecFactory {
 #endif
   }

-  virtual vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
-                                               int usage) const {
+  vpx_codec_err_t DefaultEncoderConfig(vpx_codec_enc_cfg_t *cfg,
+                                       int usage) const override {
 #if CONFIG_VP9_ENCODER
     return vpx_codec_enc_config_default(&vpx_codec_vp9_cx_algo, cfg, usage);
 #else
diff --git a/test/comp_avg_pred_test.cc b/test/comp_avg_pred_test.cc
index 70aeab8d7..3234cc9a2 100644
--- a/test/comp_avg_pred_test.cc
+++ b/test/comp_avg_pred_test.cc
@@ -49,7 +49,7 @@ using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h,
 template <int bitdepth, typename Pixel>
 class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
  public:
-  virtual void SetUp() {
+  void SetUp() override {
     avg_pred_func_ = GetParam();
     rnd_.Reset(ACMRandom::DeterministicSeed());
   }
@@ -81,11 +81,11 @@ void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() {
       // Only the reference buffer may have a stride not equal to width.
       Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
       ASSERT_TRUE(ref.Init());
-      Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(pred.Init());
-      Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(avg_ref.Init());
-      Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(avg_chk.Init());
       const int bitdepth_mask = (1 << bitdepth) - 1;
       for (int h = 0; h < height; ++h) {
@@ -121,11 +121,11 @@ void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() {
   const int height = 32;
   Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
   ASSERT_TRUE(ref.Init());
-  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(pred.Init());
-  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(avg_ref.Init());
-  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
+  Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 32);
   ASSERT_TRUE(avg_chk.Init());

   for (int i = 0; i < 500; ++i) {
@@ -167,9 +167,9 @@ void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
       const int height = 1 << height_pow;
       Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
       ASSERT_TRUE(ref.Init());
-      Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(pred.Init());
-      Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16);
+      Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 32);
       ASSERT_TRUE(avg.Init());
       const int bitdepth_mask = (1 << bitdepth) - 1;
       for (int h = 0; h < height; ++h) {
@@ -217,6 +217,11 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_sse2));
 #endif  // HAVE_SSE2

+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, AvgPredTestLBD,
+                         ::testing::Values(&vpx_comp_avg_pred_avx2));
+#endif  // HAVE_AVX2
+
 #if HAVE_NEON
 INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
                          ::testing::Values(&vpx_comp_avg_pred_neon));
@@ -260,5 +265,11 @@ INSTANTIATE_TEST_SUITE_P(
     ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
 #endif  // HAVE_SSE2

+#if HAVE_NEON
+INSTANTIATE_TEST_SUITE_P(
+    NEON, AvgPredTestHBD,
+    ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_neon>));
+#endif  // HAVE_NEON
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/config_test.cc b/test/config_test.cc
index 8f4c60e11..729b01151 100644
--- a/test/config_test.cc
+++ b/test/config_test.cc
@@ -22,24 +22,24 @@ class ConfigTest
   ConfigTest()
       : EncoderTest(GET_PARAM(0)), frame_count_in_(0), frame_count_out_(0),
         frame_count_max_(0) {}
-  virtual ~ConfigTest() {}
+  ~ConfigTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(GET_PARAM(1));
   }

-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     frame_count_in_ = 0;
     frame_count_out_ = 0;
   }

-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource * /*video*/) override {
     ++frame_count_in_;
     abort_ |= (frame_count_in_ >= frame_count_max_);
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) {
+  void FramePktHook(const vpx_codec_cx_pkt_t * /*pkt*/) override {
     ++frame_count_out_;
   }
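The comp_avg_pred buffers above move from 16-byte to 32-byte alignment, presumably because the newly instantiated AVX2 path operates on 32-byte ymm vectors, for which 16-byte alignment is no longer sufficient. A small stand-alone sketch of allocating such a buffer -- posix_memalign is used here as a portable stand-in for vpx_memalign:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  void *buf = NULL;
  /* 32-byte alignment covers a full AVX2 ymm load/store. */
  if (posix_memalign(&buf, 32, 64 * 64) != 0) return 1;
  printf("32-byte aligned? %s\n",
         ((uintptr_t)buf % 32 == 0) ? "yes" : "no"); /* yes */
  free(buf);
  return 0;
}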
diff --git a/test/consistency_test.cc b/test/consistency_test.cc
index f0e2cb297..5e872e70a 100644
--- a/test/consistency_test.cc
+++ b/test/consistency_test.cc
@@ -65,14 +65,14 @@ class ConsistencyTestBase : public ::testing::Test {
     delete[] ssim_array_;
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
   // Handle frames up to 640x480
   static const int kDataAlignment = 16;
   static const int kDataBufferSize = 640 * 480;

-  virtual void SetUp() {
+  void SetUp() override {
     source_stride_ = (width_ + 31) & ~31;
     reference_stride_ = width_ * 2;
     rnd_.Reset(ACMRandom::DeterministicSeed());
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index d56904869..ffd5c41c6 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -244,7 +244,7 @@ void highbd_filter_block2d_8_c(const uint16_t *src_ptr,

   // Vertical pass (transposed intermediate -> dst).
   {
-    uint16_t *src_ptr = intermediate_buffer;
+    src_ptr = intermediate_buffer;
     const int dst_next_row_stride = dst_stride - output_width;
     unsigned int i, j;
     for (i = 0; i < output_height; ++i) {
@@ -361,7 +361,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
 #endif
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

   static void TearDownTestSuite() {
     vpx_free(input_ - 1);
@@ -403,7 +403,7 @@ class ConvolveTest : public ::testing::TestWithParam<ConvolveParam> {
            i % kOuterBlockSize >= (BorderLeft() + Width()));
   }

-  virtual void SetUp() {
+  void SetUp() override {
     UUT_ = GET_PARAM(2);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ != 0) {
@@ -1423,6 +1423,36 @@ INSTANTIATE_TEST_SUITE_P(NEON, ConvolveTest,
                          ::testing::ValuesIn(kArrayConvolve_neon));
 #endif  // HAVE_NEON

+#if HAVE_NEON_DOTPROD
+const ConvolveFunctions convolve8_neon_dotprod(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_dotprod,
+    vpx_convolve8_avg_horiz_neon_dotprod, vpx_convolve8_vert_neon_dotprod,
+    vpx_convolve8_avg_vert_neon_dotprod, vpx_convolve8_neon_dotprod,
+    vpx_convolve8_avg_neon_dotprod, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c,
+    vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve_neon_dotprod[] = { ALL_SIZES(
+    convolve8_neon_dotprod) };
+INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon_dotprod));
+#endif  // HAVE_NEON_DOTPROD
+
+#if HAVE_NEON_I8MM
+const ConvolveFunctions convolve8_neon_i8mm(
+    vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_neon_i8mm,
+    vpx_convolve8_avg_horiz_neon_i8mm, vpx_convolve8_vert_neon_i8mm,
+    vpx_convolve8_avg_vert_neon_i8mm, vpx_convolve8_neon_i8mm,
+    vpx_convolve8_avg_neon_i8mm, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c,
+    vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c,
+    vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve_neon_i8mm[] = { ALL_SIZES(
+    convolve8_neon_i8mm) };
+INSTANTIATE_TEST_SUITE_P(NEON_I8MM, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve_neon_i8mm));
+#endif  // HAVE_NEON_I8MM
+
 #if HAVE_DSPR2
 const ConvolveFunctions convolve8_dspr2(
     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2, vpx_convolve8_horiz_dspr2,
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index a7623f09a..22f455296 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -26,9 +26,9 @@ class CpuSpeedTest
       : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)), min_psnr_(kMaxPSNR),
         tune_content_(VP9E_CONTENT_DEFAULT) {}
-  virtual ~CpuSpeedTest() {}
+  ~CpuSpeedTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(encoding_mode_);
     if (encoding_mode_ != ::libvpx_test::kRealTime) {
@@ -40,10 +40,10 @@ class CpuSpeedTest
     }
   }

-  virtual void BeginPassHook(unsigned int /*pass*/) { min_psnr_ = kMaxPSNR; }
+  void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPSNR; }

-  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
-                                  ::libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                          ::libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       encoder->Control(VP9E_SET_TUNE_CONTENT, tune_content_);
@@ -56,7 +56,7 @@ class CpuSpeedTest
     }
   }

-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     if (pkt->data.psnr.psnr[0] < min_psnr_) min_psnr_ = pkt->data.psnr.psnr[0];
   }
diff --git a/test/cq_test.cc b/test/cq_test.cc
index 292adb0d0..b74915a33 100644
--- a/test/cq_test.cc
+++ b/test/cq_test.cc
@@ -50,21 +50,21 @@ class CQTest : public ::libvpx_test::EncoderTest,
     init_flags_ = VPX_CODEC_USE_PSNR;
   }

-  virtual ~CQTest() {}
+  ~CQTest() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     InitializeConfig();
     SetMode(libvpx_test::kTwoPassGood);
   }

-  virtual void BeginPassHook(unsigned int /*pass*/) {
+  void BeginPassHook(unsigned int /*pass*/) override {
     file_size_ = 0;
     psnr_ = 0.0;
     n_frames_ = 0;
   }

-  virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
-                                  libvpx_test::Encoder *encoder) {
+  void PreEncodeFrameHook(libvpx_test::VideoSource *video,
+                          libvpx_test::Encoder *encoder) override {
     if (video->frame() == 0) {
       if (cfg_.rc_end_usage == VPX_CQ) {
         encoder->Control(VP8E_SET_CQ_LEVEL, cq_level_);
@@ -73,12 +73,12 @@ class CQTest : public ::libvpx_test::EncoderTest,
     }
   }

-  virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
     psnr_ += pow(10.0, pkt->data.psnr.psnr[0] / 10.0);
     n_frames_++;
   }

-  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+  void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override {
     file_size_ += pkt->data.frame.sz;
   }
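The new vpx_convolve8_*_neon_dotprod and _neon_i8mm kernels above are selected at run time according to which Armv8 extensions the CPU reports. A hand-rolled sketch of that dispatch idea -- the cap-bit and kernel names below are illustrative stand-ins, not the actual vpx_ports/RTCD API:

#include <stdio.h>

typedef int (*kernel_fn)(void);

static int kernel_c(void) { return 0; }
static int kernel_neon(void) { return 1; }
static int kernel_neon_dotprod(void) { return 2; }

enum { CAP_NEON = 1 << 0, CAP_NEON_DOTPROD = 1 << 1 }; /* illustrative */

static kernel_fn pick_kernel(int caps) {
  /* Prefer the most capable extension the CPU reports. */
  if (caps & CAP_NEON_DOTPROD) return kernel_neon_dotprod;
  if (caps & CAP_NEON) return kernel_neon;
  return kernel_c;
}

int main(void) {
  printf("%d\n", pick_kernel(CAP_NEON | CAP_NEON_DOTPROD)()); /* prints 2 */
  return 0;
}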
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index d4ef7ae13..8c4213ee1 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -27,6 +27,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"

 using libvpx_test::ACMRandom;

@@ -309,7 +310,7 @@ void idct16x16_10_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {

 class Trans16x16TestBase {
  public:
-  virtual ~Trans16x16TestBase() {}
+  virtual ~Trans16x16TestBase() = default;

  protected:
   virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
@@ -548,12 +549,50 @@ class Trans16x16TestBase {
     }
   }

+  void RunSpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+
+    DECLARE_ALIGNED(32, int16_t, input_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_ref_block[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, output_block[kNumCoeffs]);
+
+    // Initialize a test block with input range [-mask_, mask_].
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    vpx_usec_timer timer_c;
+    vpx_usec_timer_start(&timer_c);
+    for (int i = 0; i < count_test_block; ++i) {
+      vpx_fdct16x16_c(input_block, output_ref_block, pitch_);
+    }
+    vpx_usec_timer_mark(&timer_c);
+    c_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_c));
+
+    vpx_usec_timer timer_mod;
+    vpx_usec_timer_start(&timer_mod);
+    for (int i = 0; i < count_test_block; ++i) {
+      RunFwdTxfm(input_block, output_block, pitch_);
+    }
+
+    vpx_usec_timer_mark(&timer_mod);
+    simd_sum_time += static_cast<int>(vpx_usec_timer_elapsed(&timer_mod));
+
+    printf(
+        "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+        simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
   void CompareInvReference(IdctFunc ref_txfm, int thresh) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int count_test_block = 10000;
     const int eob = 10;
     const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
     DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
     DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -604,6 +643,80 @@ class Trans16x16TestBase {
     }
   }

+  void RunInvTrans16x16SpeedTest(IdctFunc ref_txfm, int thresh) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 10;
+    const int16_t *scan = vp9_default_scan_orders[TX_16X16].scan;
+    int64_t c_sum_time = 0;
+    int64_t simd_sum_time = 0;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (j < eob) {
+        // Random values less than the threshold, either positive or negative
+        coeff[scan[j]] = rnd(thresh);
+      } else {
+        coeff[scan[j]] = 0;
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        dst[j] = 0;
+        ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        dst16[j] = 0;
+        ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+    if (bit_depth_ == VPX_BITS_8) {
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        ref_txfm(coeff, ref, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, dst, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+    } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        ref_txfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    printf(
+        "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n",
+        c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
   int pitch_;
   int tx_type_;
   vpx_bit_depth_t bit_depth_;
@@ -615,9 +728,9 @@ class Trans16x16TestBase {
 class Trans16x16DCT : public Trans16x16TestBase,
                       public ::testing::TestWithParam<Dct16x16Param> {
  public:
-  virtual ~Trans16x16DCT() {}
+  ~Trans16x16DCT() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -636,13 +749,13 @@ class Trans16x16DCT : public Trans16x16TestBase,
     inv_txfm_ref = idct16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }

@@ -664,12 +777,14 @@ TEST_P(Trans16x16DCT, QuantCheck) {

 TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }

+TEST_P(Trans16x16DCT, DISABLED_Speed) { RunSpeedTest(); }
+
 class Trans16x16HT : public Trans16x16TestBase,
                      public ::testing::TestWithParam<Ht16x16Param> {
  public:
-  virtual ~Trans16x16HT() {}
+  ~Trans16x16HT() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     tx_type_ = GET_PARAM(2);
@@ -688,13 +803,13 @@ class Trans16x16HT : public Trans16x16TestBase,
     inv_txfm_ref = iht16x16_ref;
 #endif
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override {
     fwd_txfm_(in, out, stride, tx_type_);
   }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride, tx_type_);
   }

@@ -714,13 +829,12 @@ TEST_P(Trans16x16HT, QuantCheck) {
   RunQuantCheck(429, 729);
 }

-#if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 class InvTrans16x16DCT : public Trans16x16TestBase,
                          public ::testing::TestWithParam<Idct16x16Param> {
  public:
-  virtual ~InvTrans16x16DCT() {}
+  ~InvTrans16x16DCT() override = default;

-  virtual void SetUp() {
+  void SetUp() override {
     ref_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     thresh_ = GET_PARAM(2);
@@ -728,11 +842,12 @@ class InvTrans16x16DCT : public Trans16x16TestBase,
     pitch_ = 16;
     mask_ = (1 << bit_depth_) - 1;
   }
-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
-  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/, int /*stride*/) {}
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+  void RunFwdTxfm(int16_t * /*in*/, tran_low_t * /*out*/,
+                  int /*stride*/) override {}
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override {
     inv_txfm_(out, dst, stride);
   }

@@ -745,7 +860,10 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans16x16DCT);
 TEST_P(InvTrans16x16DCT, CompareReference) {
   CompareInvReference(ref_txfm_, thresh_);
 }
-#endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+TEST_P(InvTrans16x16DCT, DISABLED_Speed) {
+  RunInvTrans16x16SpeedTest(ref_txfm_, thresh_);
+}

 using std::make_tuple;
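The new DISABLED_Speed tests above all follow one timing pattern: run the C reference and the SIMD path the same number of times with vpx_usec_timer and report the ratio. A minimal C skeleton of that harness, using the real vpx_ports timer API (do_work is a placeholder for the function under test):

#include <stdio.h>
#include "vpx_ports/vpx_timer.h"

static void do_work(void) {} /* placeholder for the kernel under test */

int main(void) {
  struct vpx_usec_timer timer;
  int i;
  vpx_usec_timer_start(&timer);
  for (i = 0; i < 10000; ++i) do_work();
  vpx_usec_timer_mark(&timer);
  /* Timing both a reference and an optimized path this way and dividing
   * the two elapsed values gives the "Gain" figure the tests print. */
  printf("elapsed us: %d\n", (int)vpx_usec_timer_elapsed(&timer));
  return 0;
}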
@@ -787,6 +905,12 @@ INSTANTIATE_TEST_SUITE_P(
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(C, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_idct16x16_256_add_c,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      6225, VPX_BITS_8)));
+
 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -821,8 +945,25 @@ INSTANTIATE_TEST_SUITE_P(
                                  2, VPX_BITS_8),
                       make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2,
                                  3, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(SSE2, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(
+                             &vpx_idct16x16_256_add_c,
+                             &vpx_idct16x16_256_add_sse2, 6225, VPX_BITS_8)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, Trans16x16DCT,
+    ::testing::Values(make_tuple(&vpx_fdct16x16_avx2,
+                                 &vpx_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(AVX2, InvTrans16x16DCT,
+                         ::testing::Values(make_tuple(
+                             &vpx_idct16x16_256_add_c,
+                             &vpx_idct16x16_256_add_avx2, 6225, VPX_BITS_8)));
+#endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_SUITE_P(
     SSE2, Trans16x16DCT,
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 91bb8e01e..6233b17a4 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -24,10 +24,12 @@
 #include "test/register_state_check.h"
 #include "test/util.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/msvc.h"  // for round()
+#include "vpx_ports/vpx_timer.h"

 using libvpx_test::ACMRandom;

@@ -71,6 +73,9 @@ typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
 typedef std::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
     Trans32x32Param;

+typedef std::tuple<InvTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t, int, int>
+    InvTrans32x32Param;
+
 #if CONFIG_VP9_HIGHBITDEPTH
 void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
   vpx_highbd_idct32x32_1024_add_c(in, CAST_TO_SHORTPTR(out), stride, 10);
@@ -84,8 +89,8 @@ void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
 class Trans32x32Test : public AbstractBench,
                        public ::testing::TestWithParam<Trans32x32Param> {
  public:
-  virtual ~Trans32x32Test() {}
-  virtual void SetUp() {
+  ~Trans32x32Test() override = default;
+  void SetUp() override {
     fwd_txfm_ = GET_PARAM(0);
     inv_txfm_ = GET_PARAM(1);
     version_ = GET_PARAM(2);  // 0: high precision forward transform
@@ -94,7 +99,7 @@ class Trans32x32Test : public AbstractBench,
     mask_ = (1 << bit_depth_) - 1;
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
   int version_;
@@ -105,7 +110,7 @@ class Trans32x32Test : public AbstractBench,
   int16_t *bench_in_;
   tran_low_t *bench_out_;

-  virtual void Run();
+  void Run() override;
 };

 void Trans32x32Test::Run() { fwd_txfm_(bench_in_, bench_out_, 32); }
@@ -314,6 +319,174 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
   }
 }

+class InvTrans32x32Test : public ::testing::TestWithParam<InvTrans32x32Param> {
+ public:
+  ~InvTrans32x32Test() override = default;
+  void SetUp() override {
+    ref_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    version_ = GET_PARAM(2);  // 0: high precision forward transform
+                              // 1: low precision version for rd loop
+    bit_depth_ = GET_PARAM(3);
+    eob_ = GET_PARAM(4);
+    thresh_ = GET_PARAM(4);
+    mask_ = (1 << bit_depth_) - 1;
+    pitch_ = 32;
+  }
+
+  void TearDown() override { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunRefTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    ref_txfm_(out, dst, stride);
+  }
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
+  }
+  int version_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  int eob_;
+  int thresh_;
+
+  InvTxfmFunc ref_txfm_;
+  InvTxfmFunc inv_txfm_;
+  int pitch_;
+
+  void RunInvTrans32x32SpeedTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    int64_t c_sum_time = 0;
+    int64_t simd_sum_time = 0;
+    const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int j = 0; j < kNumCoeffs; ++j) {
+      if (j < eob_) {
+        // Random values less than the threshold, either positive or negative
+        coeff[scan[j]] = rnd(thresh_);
+      } else {
+        coeff[scan[j]] = 0;
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        dst[j] = 0;
+        ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        dst16[j] = 0;
+        ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+    }
+
+    if (bit_depth_ == VPX_BITS_8) {
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunRefTxfm(coeff, ref, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, dst, pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+    } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_usec_timer timer_c;
+      vpx_usec_timer_start(&timer_c);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_c);
+      c_sum_time += vpx_usec_timer_elapsed(&timer_c);
+
+      vpx_usec_timer timer_mod;
+      vpx_usec_timer_start(&timer_mod);
+      for (int i = 0; i < count_test_block; ++i) {
+        RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_);
+      }
+      vpx_usec_timer_mark(&timer_mod);
+      simd_sum_time += vpx_usec_timer_elapsed(&timer_mod);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+    printf(
+        "c_time = %" PRId64 " \t simd_time = %" PRId64 " \t Gain = %4.2f \n",
+        c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+  }
+
+  void CompareInvReference32x32() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 10000;
+    const int eob = 31;
+    const int16_t *scan = vp9_default_scan_orders[TX_32X32].scan;
+    DECLARE_ALIGNED(32, tran_low_t, coeff[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint8_t, ref[kNumCoeffs]);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
+    DECLARE_ALIGNED(16, uint16_t, ref16[kNumCoeffs]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    for (int i = 0; i < count_test_block; ++i) {
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (j < eob) {
+          coeff[scan[j]] = rnd.Rand8Extremes();
+        } else {
+          coeff[scan[j]] = 0;
+        }
+        if (bit_depth_ == VPX_BITS_8) {
+          dst[j] = 0;
+          ref[j] = 0;
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          dst16[j] = 0;
+          ref16[j] = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        RunRefTxfm(coeff, ref, pitch_);
+        RunInvTxfm(coeff, dst, pitch_);
+      } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+        RunRefTxfm(coeff, CAST_TO_BYTEPTR(ref16), pitch_);
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(coeff, CAST_TO_BYTEPTR(dst16), pitch_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - ref[j] : dst16[j] - ref16[j];
+#else
+        const uint32_t diff = dst[j] - ref[j];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t error = diff * diff;
+        EXPECT_EQ(0u, error) << "Error: 32x32 IDCT Comparison has error "
+                             << error << " at index " << j;
+      }
+    }
+  }
+};
+
+GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(InvTrans32x32Test);
+
+TEST_P(InvTrans32x32Test, DISABLED_Speed) { RunInvTrans32x32SpeedTest(); }
+TEST_P(InvTrans32x32Test, CompareReference) { CompareInvReference32x32(); }
+
 using std::make_tuple;

 #if CONFIG_VP9_HIGHBITDEPTH
@@ -334,6 +507,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_c, &vpx_idct32x32_1024_add_c,
                                  1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    C, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_c, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_c, 0,
+                   VPX_BITS_8, 16, 6255)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH

 #if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
@@ -352,6 +533,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_sse2,
                                  &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    SSE2, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_sse2, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_sse2, 0,
+                   VPX_BITS_8, 16, 6225)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -377,6 +566,14 @@ INSTANTIATE_TEST_SUITE_P(
                                  &vpx_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
                       make_tuple(&vpx_fdct32x32_rd_avx2,
                                  &vpx_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    AVX2, InvTrans32x32Test,
+    ::testing::Values(
+        (make_tuple(&vpx_idct32x32_1024_add_c, &vpx_idct32x32_1024_add_avx2, 0,
+                    VPX_BITS_8, 32, 6225)),
+        make_tuple(&vpx_idct32x32_135_add_c, &vpx_idct32x32_135_add_avx2, 0,
+                   VPX_BITS_8, 16, 6225)));
 #endif  // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE

 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
diff --git a/test/dct_partial_test.cc b/test/dct_partial_test.cc
index e57fa0f48..ec6f543f7 100644
--- a/test/dct_partial_test.cc
+++ b/test/dct_partial_test.cc
@@ -67,7 +67,7 @@ class PartialFdctTest : public ::testing::TestWithParam<PartialFdctParam> {
     bit_depth_ = GET_PARAM(2);
   }

-  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+  void TearDown() override { libvpx_test::ClearSystemState(); }

  protected:
   void RunTest() {
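The InvTrans32x32Test instantiations above are parameterized per eob bucket because VP9 uses reduced inverse transforms when only low-frequency coefficients are present, which is what the _135 and _1024 variant names refer to. A sketch of that selection idea -- the thresholds below are inferred from the variant names and are an assumption, not the decoder's actual dispatch code:

#include <stdio.h>

static const char *idct32x32_variant(int eob) {
  if (eob <= 34) return "idct32x32_34_add";   /* low-frequency corner only */
  if (eob <= 135) return "idct32x32_135_add"; /* partial transform */
  return "idct32x32_1024_add";                /* full 32x32 transform */
}

int main(void) {
  printf("eob 100 -> %s\n", idct32x32_variant(100)); /* idct32x32_135_add */
  printf("eob 500 -> %s\n", idct32x32_variant(500)); /* idct32x32_1024_add */
  return 0;
}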
+134,7 @@ void fwht_ref(const Buffer<int16_t> &in, Buffer<tran_low_t> *out, int size, class TransTestBase : public ::testing::TestWithParam<DctParam> { public: - virtual void SetUp() { + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); const int idx = GET_PARAM(0); const FuncInfo *func_info = &(GET_PARAM(1)[idx]); @@ -166,7 +166,7 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> { ASSERT_NE(dst_, nullptr); } - virtual void TearDown() { + void TearDown() override { vpx_free(src_); src_ = nullptr; vpx_free(dst_); @@ -358,14 +358,6 @@ class TransTestBase : public ::testing::TestWithParam<DctParam> { ASSERT_TRUE(in.Init()); Buffer<tran_low_t> coeff = Buffer<tran_low_t>(size_, size_, 0, 16); ASSERT_TRUE(coeff.Init()); - Buffer<uint8_t> dst = Buffer<uint8_t>(size_, size_, 0, 16); - ASSERT_TRUE(dst.Init()); - Buffer<uint8_t> src = Buffer<uint8_t>(size_, size_, 0); - ASSERT_TRUE(src.Init()); - Buffer<uint16_t> dst16 = Buffer<uint16_t>(size_, size_, 0, 16); - ASSERT_TRUE(dst16.Init()); - Buffer<uint16_t> src16 = Buffer<uint16_t>(size_, size_, 0); - ASSERT_TRUE(src16.Init()); for (int i = 0; i < count_test_block; ++i) { InitMem(); @@ -671,8 +663,12 @@ static const FuncInfo ht_neon_func_info[] = { 4, 2 }, { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8, 2 }, + { &vp9_highbd_fht8x8_neon, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, + 8, 2 }, { &vp9_highbd_fht16x16_c, &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 }, + { &vp9_highbd_fht16x16_neon, + &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 }, #endif { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 }, { &vp9_fht4x4_neon, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 }, diff --git a/test/decode_api_test.cc b/test/decode_api_test.cc index 9e82ace1b..44e439772 100644 --- a/test/decode_api_test.cc +++ b/test/decode_api_test.cc @@ -20,7 +20,7 @@ namespace { #define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0])) TEST(DecodeAPI, InvalidParams) { - static const vpx_codec_iface_t *kCodecs[] = { + static vpx_codec_iface_t *kCodecs[] = { #if CONFIG_VP8_DECODER &vpx_codec_vp8_dx_algo, #endif @@ -120,7 +120,7 @@ void TestVp9Controls(vpx_codec_ctx_t *dec) { } TEST(DecodeAPI, Vp9InvalidDecode) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; const char filename[] = "invalid-vp90-2-00-quantizer-00.webm.ivf.s5861_r01-05_b6-.v2.ivf"; libvpx_test::IVFVideoSource video(filename); @@ -147,7 +147,7 @@ TEST(DecodeAPI, Vp9InvalidDecode) { void TestPeekInfo(const uint8_t *const data, uint32_t data_sz, uint32_t peek_size) { - const vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; + vpx_codec_iface_t *const codec = &vpx_codec_vp9_dx_algo; // Verify behavior of vpx_codec_decode. vpx_codec_decode doesn't even get // to decoder_peek_si_internal on frames of size < 8. 
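  // A hedged illustration of that path (decoder setup shown for context; the
  // specific failure expectation is an assumption for this sketch, not taken
  // from the test):
  //   vpx_codec_ctx_t dec;
  //   vpx_codec_dec_init(&dec, codec, nullptr, 0);
  //   EXPECT_NE(vpx_codec_decode(&dec, data, /*data_sz=*/7, nullptr, 0),
  //             VPX_CODEC_OK);
  //   vpx_codec_destroy(&dec);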
if (data_sz >= 8) { diff --git a/test/decode_corrupted.cc b/test/decode_corrupted.cc index 31e1da69c..58773d7b8 100644 --- a/test/decode_corrupted.cc +++ b/test/decode_corrupted.cc @@ -28,9 +28,9 @@ class DecodeCorruptedFrameTest DecodeCorruptedFrameTest() : EncoderTest(GET_PARAM(0)) {} protected: - virtual ~DecodeCorruptedFrameTest() {} + ~DecodeCorruptedFrameTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_lag_in_frames = 0; @@ -44,16 +44,16 @@ class DecodeCorruptedFrameTest dec_cfg_.threads = 1; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) encoder->Control(VP8E_SET_CPUUSED, 7); } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) {} + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} - virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( - const vpx_codec_cx_pkt_t *pkt) { + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { // Don't edit frame packet on key frame. if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) return pkt; if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; @@ -66,9 +66,9 @@ class DecodeCorruptedFrameTest return &modified_pkt_; } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { EXPECT_NE(res_dec, VPX_CODEC_MEM_ERROR) << decoder->DecodeError(); return VPX_CODEC_MEM_ERROR != res_dec; } diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc index e07a66744..383fd2d89 100644 --- a/test/decode_perf_test.cc +++ b/test/decode_perf_test.cc @@ -116,11 +116,11 @@ class VP9NewEncodeDecodePerfTest protected: VP9NewEncodeDecodePerfTest() : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), speed_(0), - outfile_(0), out_frames_(0) {} + outfile_(nullptr), out_frames_(0) {} - virtual ~VP9NewEncodeDecodePerfTest() {} + ~VP9NewEncodeDecodePerfTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -137,8 +137,8 @@ class VP9NewEncodeDecodePerfTest cfg_.rc_end_usage = VPX_VBR; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, speed_); encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); @@ -146,14 +146,14 @@ class VP9NewEncodeDecodePerfTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { const std::string data_path = getenv("LIBVPX_TEST_DATA_PATH"); const std::string path_to_source = data_path + "/" + kNewEncodeOutputFile; outfile_ = fopen(path_to_source.c_str(), "wb"); ASSERT_NE(outfile_, nullptr); } - virtual void EndPassHook() { + void EndPassHook() override { if (outfile_ != nullptr) { if (!fseek(outfile_, 0, SEEK_SET)) { ivf_write_file_header(outfile_, &cfg_, VP9_FOURCC, out_frames_); @@ -163,7 +163,7 @@ class VP9NewEncodeDecodePerfTest } } - virtual void 
FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -177,7 +177,7 @@ class VP9NewEncodeDecodePerfTest pkt->data.frame.sz); } - virtual bool DoDecode() const { return false; } + bool DoDecode() const override { return false; } void set_speed(unsigned int speed) { speed_ = speed; } diff --git a/test/decode_svc_test.cc b/test/decode_svc_test.cc index ec9935da7..7098e7b27 100644 --- a/test/decode_svc_test.cc +++ b/test/decode_svc_test.cc @@ -25,17 +25,16 @@ class DecodeSvcTest : public ::libvpx_test::DecoderTest, public ::libvpx_test::CodecTestWithParam<const char *> { protected: DecodeSvcTest() : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)) {} - virtual ~DecodeSvcTest() {} + ~DecodeSvcTest() override = default; - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (video.frame_number() == 0) decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, spatial_layer_); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_EQ(img.d_w, width_); ASSERT_EQ(img.d_h, height_); total_frames_ = frame_number; diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index e0e793b15..ee7d8d27c 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -11,17 +11,24 @@ #include <climits> #include <cstring> #include <initializer_list> +#include <new> #include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/codec_factory.h" +#include "test/encode_test_driver.h" +#include "test/i420_video_source.h" +#include "test/video_source.h" #include "./vpx_config.h" -#include "test/video_source.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" #include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" +#include "vpx/vpx_tpl.h" namespace { -const vpx_codec_iface_t *kCodecIfaces[] = { +vpx_codec_iface_t *kCodecIfaces[] = { #if CONFIG_VP8_ENCODER &vpx_codec_vp8_cx_algo, #endif @@ -30,7 +37,7 @@ const vpx_codec_iface_t *kCodecIfaces[] = { #endif }; -bool IsVP9(const vpx_codec_iface_t *iface) { +bool IsVP9(vpx_codec_iface_t *iface) { static const char kVP9Name[] = "WebM Project VP9"; return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == 0; @@ -118,6 +125,62 @@ TEST(EncodeAPI, ImageSizeSetting) { vpx_codec_destroy(&enc); } + +// Verifies the fix for a float-cast-overflow in vp8_change_config(). +// +// Causes cpi->framerate to become the largest possible value (10,000,000) in +// VP8 by setting cfg.g_timebase to 1/10000000 and passing a duration of 1 to +// vpx_codec_encode(). +TEST(EncodeAPI, HugeFramerateVp8) { + vpx_codec_iface_t *const iface = vpx_codec_vp8_cx(); + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + cfg.g_w = 271; + cfg.g_h = 1080; + cfg.g_timebase.num = 1; + // Largest value (VP8's TICKS_PER_SEC) such that frame duration is nonzero (1 + // tick). + cfg.g_timebase.den = 10000000; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.rc_end_usage = VPX_CBR; + + vpx_codec_ctx_t enc; + // Before we encode the first frame, cpi->framerate is set to a guess (the + // reciprocal of cfg.g_timebase). 
If this guess doesn't seem reasonable + // (> 180), cpi->framerate is set to 30. + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, -12), VPX_CODEC_OK); + + vpx_image_t *const image = + vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h, 1); + ASSERT_NE(image, nullptr); + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + // Encode a frame. + // Up to this point cpi->framerate is 30. Now pass a duration of only 1. This + // causes cpi->framerate to become 10,000,000. + ASSERT_EQ(vpx_codec_encode(&enc, image, 0, 1, 0, VPX_DL_REALTIME), + VPX_CODEC_OK); + + // Change to the same config. Since cpi->framerate is now huge, when it is + // used to calculate raw_target_rate (bit rate of uncompressed frames), the + // result is likely to overflow an unsigned int. + ASSERT_EQ(vpx_codec_enc_config_set(&enc, &cfg), VPX_CODEC_OK); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} #endif // Set up 2 spatial streams with 2 temporal layers per stream, and generate @@ -196,7 +259,7 @@ TEST(EncodeAPI, MultiResEncode) { TEST(EncodeAPI, SetRoi) { static struct { - const vpx_codec_iface_t *iface; + vpx_codec_iface_t *iface; int ctrl_id; } kCodecs[] = { #if CONFIG_VP8_ENCODER @@ -302,7 +365,7 @@ TEST(EncodeAPI, SetRoi) { } } -void InitCodec(const vpx_codec_iface_t &iface, int width, int height, +void InitCodec(vpx_codec_iface_t &iface, int width, int height, vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) { cfg->g_w = width; cfg->g_h = height; @@ -392,7 +455,7 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { EXPECT_EQ(vpx_codec_enc_config_set(&enc.ctx, &cfg), VPX_CODEC_OK) << vpx_codec_error_detail(&enc.ctx); - cfg.g_w = 16; + cfg.g_w = 1000; cfg.g_h = 720; for (const auto threads : { 1, 4, 8, 6, 2, 1 }) { @@ -404,4 +467,451 @@ TEST(EncodeAPI, ConfigResizeChangeThreadCount) { } } +#if CONFIG_VP9_ENCODER +// Frame size needed to trigger the overflow exceeds the max buffer allowed on +// 32-bit systems defined by VPX_MAX_ALLOCABLE_MEMORY +#if VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 +TEST(EncodeAPI, ConfigLargeTargetBitrateVp9) { + constexpr int kWidth = 12383; + constexpr int kHeight = 8192; + constexpr auto *iface = &vpx_codec_vp9_cx_algo; + SCOPED_TRACE(vpx_codec_iface_name(iface)); + vpx_codec_enc_cfg_t cfg = {}; + struct Encoder { + ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); } + vpx_codec_ctx_t ctx = {}; + } enc; + + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, 0), VPX_CODEC_OK); + // The following setting will cause avg_frame_bandwidth in rate control to be + // larger than INT_MAX + cfg.rc_target_bitrate = INT_MAX; + // Framerate 0.1 (equivalent to timebase 10) is the smallest framerate allowed + // by libvpx + cfg.g_timebase.den = 1; + cfg.g_timebase.num = 10; + EXPECT_NO_FATAL_FAILURE(InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg)) + << "target bitrate: " << cfg.rc_target_bitrate << " framerate: " + << static_cast<double>(cfg.g_timebase.den) / cfg.g_timebase.num; +} +#endif // VPX_ARCH_X86_64 || VPX_ARCH_AARCH64 + +vpx_image_t *CreateImage(const unsigned int width, const unsigned int height) { + vpx_image_t *image = + 
vpx_img_alloc(nullptr, VPX_IMG_FMT_I420, width, height, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + + return image; +} + +// Emulates the WebCodecs VideoEncoder interface. +class VP9Encoder { + public: + VP9Encoder(int speed) : speed_(speed) {} + ~VP9Encoder(); + + void Configure(unsigned int threads, unsigned int width, unsigned int height, + vpx_rc_mode end_usage, unsigned long deadline); + void Encode(bool key_frame); + + private: + const int speed_; + bool initialized_ = false; + vpx_codec_enc_cfg_t cfg_; + vpx_codec_ctx_t enc_; + int frame_index_ = 0; + unsigned long deadline_ = 0; +}; + +VP9Encoder::~VP9Encoder() { + if (initialized_) { + EXPECT_EQ(vpx_codec_destroy(&enc_), VPX_CODEC_OK); + } +} + +void VP9Encoder::Configure(unsigned int threads, unsigned int width, + unsigned int height, vpx_rc_mode end_usage, + unsigned long deadline) { + deadline_ = deadline; + + if (!initialized_) { + vpx_codec_iface_t *const iface = vpx_codec_vp9_cx(); + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg_, /*usage=*/0), + VPX_CODEC_OK); + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.g_timebase.num = 1; + cfg_.g_timebase.den = 1000 * 1000; // microseconds + cfg_.g_pass = VPX_RC_ONE_PASS; + cfg_.g_lag_in_frames = 0; + cfg_.rc_end_usage = end_usage; + cfg_.rc_min_quantizer = 2; + cfg_.rc_max_quantizer = 58; + ASSERT_EQ(vpx_codec_enc_init(&enc_, iface, &cfg_, 0), VPX_CODEC_OK); + ASSERT_EQ(vpx_codec_control(&enc_, VP8E_SET_CPUUSED, speed_), VPX_CODEC_OK); + initialized_ = true; + return; + } + + cfg_.g_threads = threads; + cfg_.g_w = width; + cfg_.g_h = height; + cfg_.rc_end_usage = end_usage; + ASSERT_EQ(vpx_codec_enc_config_set(&enc_, &cfg_), VPX_CODEC_OK) + << vpx_codec_error_detail(&enc_); +} + +void VP9Encoder::Encode(bool key_frame) { + const vpx_codec_cx_pkt_t *pkt; + vpx_image_t *image = CreateImage(cfg_.g_w, cfg_.g_h); + ASSERT_NE(image, nullptr); + const vpx_enc_frame_flags_t frame_flags = key_frame ? VPX_EFLAG_FORCE_KF : 0; + ASSERT_EQ( + vpx_codec_encode(&enc_, image, frame_index_, 1, frame_flags, deadline_), + VPX_CODEC_OK); + frame_index_++; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc_, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + } + vpx_img_free(image); +} + +// This is a test case from clusterfuzz. +TEST(EncodeAPI, PrevMiCheckNullptr) { + VP9Encoder encoder(0); + encoder.Configure(0, 1554, 644, VPX_VBR, VPX_DL_REALTIME); + + // First step: encode, without forcing KF. + encoder.Encode(false); + // Second step: change config + encoder.Configure(0, 1131, 644, VPX_CBR, VPX_DL_GOOD_QUALITY); + // Third step: encode, without forcing KF + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310477034. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, MultipleChangeConfigResize) { + VP9Encoder encoder(3); + + // Set initial config. + encoder.Configure(3, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. 
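+  // (Reading aid for the Configure() calls in these regression tests: per the
+  // VP9Encoder helper above, the arguments are threads, width, height,
+  // rc end-usage, and deadline, in that order.)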
+ encoder.Configure(16, 31, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config again. + encoder.Configure(0, 17, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Encode 3rd frame with same config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310663186. +// Encode set of frames while varying the deadline on the fly from +// good to realtime to best and back to realtime. +TEST(EncodeAPI, DynamicDeadlineChange) { + // Use realtime speed: 5 to 9. + VP9Encoder encoder(5); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); + + // Encode 4th frame with same config, delta frame. + encoder.Encode(false); + + // Encode 5th frame with same config, key frame. + encoder.Encode(true); + + // Change config: change deadline to BEST. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_BEST_QUALITY); + + // Encode 6th frame with new config, set delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 7th frame with new config, set delta frame. + encoder.Encode(false); + + // Encode 8th frame with new config, set key frame. + encoder.Encode(true); + + // Encode 9th frame with new config, set delta frame. + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer310340241) { + VP9Encoder encoder(-6); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(true); + + // Encode 2nd frame, delta frame. + encoder.Encode(false); + + // Change config: change deadline to REALTIME. + encoder.Configure(0, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set key frame. + encoder.Encode(true); +} + +// This is a test case from clusterfuzz: based on b/312517065. +TEST(EncodeAPI, Buganizer312517065) { + VP9Encoder encoder(4); + encoder.Configure(0, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(10, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Encode(false); + encoder.Configure(6, 327, 269, VPX_VBR, VPX_DL_GOOD_QUALITY); + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311489136. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311489136) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(3, 1678, 202, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(8, 1037, 476, VPX_CBR, VPX_DL_REALTIME); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); + + // Change config again. + encoder.Configure(0, 580, 620, VPX_CBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with same config, set delta frame. 
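+  // (Per the VP9Encoder helper above, Encode(false) leaves frame_flags at 0,
+  // i.e. it requests an ordinary inter frame rather than passing
+  // VPX_EFLAG_FORCE_KF.)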
+ encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/312656387. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer312656387) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(16, 1, 1024, VPX_CBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1, 1024, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config again. + encoder.Configure(14, 1, 595, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config. + encoder.Encode(true); + + // Change config again. + encoder.Configure(2, 1, 1024, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/310329177. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer310329177) { + VP9Encoder encoder(6); + + // Set initial config. + encoder.Configure(10, 41, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config, set delta frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311394513. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311394513) { + VP9Encoder encoder(-7); + + // Set initial config. + encoder.Configure(0, 5, 9, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 2, 1, VPX_VBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(true); +} + +TEST(EncodeAPI, Buganizer311985118) { + VP9Encoder encoder(0); + + // Set initial config, in particular set deadline to GOOD mode. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 1st frame. + encoder.Encode(false); + + // Change config: change threads and width. + encoder.Configure(0, 1574, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Change config: change threads, width and height. + encoder.Configure(16, 837, 432, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame. + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/314857577. +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer314857577) { + VP9Encoder encoder(4); + + // Set initial config. + encoder.Configure(12, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 1060, 1, VPX_CBR, VPX_DL_REALTIME); + + // Encode 2nd frame with new config. + encoder.Encode(false); + + // Encode 3rd frame with new config. + encoder.Encode(true); + + // Change config. + encoder.Configure(15, 33, 437, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 4th frame with new config. + encoder.Encode(true); + + // Encode 5th frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(5, 327, 269, VPX_VBR, VPX_DL_REALTIME); + + // Change config. + encoder.Configure(15, 1060, 437, VPX_CBR, VPX_DL_REALTIME); + + // Encode 6th frame with new config. + encoder.Encode(false); + + // Encode 7th frame with new config. + encoder.Encode(false); + + // Change config. 
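+  // (This restores the 1060x437 VBR/realtime shape the test started with,
+  // now with 4 threads.)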
+ encoder.Configure(4, 1060, 437, VPX_VBR, VPX_DL_REALTIME); + + // Encode 8th frame with new config. + encoder.Encode(false); +} + +TEST(EncodeAPI, Buganizer312875957PredBufferStride) { + VP9Encoder encoder(-1); + + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Encode(false); + encoder.Configure(0, 456, 486, VPX_VBR, VPX_DL_REALTIME); + encoder.Encode(true); + encoder.Configure(0, 1678, 620, VPX_CBR, 1000000); + encoder.Encode(false); + encoder.Encode(false); +} + +// This is a test case from clusterfuzz: based on b/311294795 +// Encode a few frames with multiple change config calls +// with different frame sizes. +TEST(EncodeAPI, Buganizer311294795) { + VP9Encoder encoder(1); + + // Set initial config. + encoder.Configure(12, 1678, 620, VPX_VBR, VPX_DL_REALTIME); + + // Encode first frame. + encoder.Encode(false); + + // Change config. + encoder.Configure(16, 632, 620, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 2nd frame with new config + encoder.Encode(true); + + // Change config. + encoder.Configure(16, 1678, 342, VPX_VBR, VPX_DL_GOOD_QUALITY); + + // Encode 3rd frame with new config. + encoder.Encode(false); + + // Change config. + encoder.Configure(0, 1574, 618, VPX_VBR, VPX_DL_REALTIME); + // Encode more frames with new config. + encoder.Encode(false); + encoder.Encode(false); +} +#endif // CONFIG_VP9_ENCODER + } // namespace diff --git a/test/encode_perf_test.cc b/test/encode_perf_test.cc index 142a55952..171ff8eec 100644 --- a/test/encode_perf_test.cc +++ b/test/encode_perf_test.cc @@ -61,9 +61,9 @@ class VP9EncodePerfTest : EncoderTest(GET_PARAM(0)), min_psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)), speed_(0), threads_(1) {} - virtual ~VP9EncodePerfTest() {} + ~VP9EncodePerfTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -82,8 +82,8 @@ class VP9EncodePerfTest cfg_.g_threads = threads_; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { const int log2_tile_columns = 3; encoder->Control(VP8E_SET_CPUUSED, speed_); @@ -93,19 +93,19 @@ class VP9EncodePerfTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { min_psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < min_psnr_) { min_psnr_ = pkt->data.psnr.psnr[0]; } } // for performance reasons don't decode - virtual bool DoDecode() const { return false; } + bool DoDecode() const override { return false; } double min_psnr() const { return min_psnr_; } diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index b57df8529..c7974894c 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -19,7 +19,7 @@ #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER #include "vpx/vp8cx.h" #endif -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace libvpx_test { @@ -153,6 +153,11 @@ class Encoder { const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); } + + void Control(int ctrl_id, VpxTplGopStats *arg) { + const vpx_codec_err_t res = vpx_codec_control_(&encoder_, ctrl_id, arg); + ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError(); + } 
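+  // A hedged usage sketch for this overload (it assumes the
+  // VP9E_GET_TPL_STATS control that accompanies vpx/vpx_tpl.h; the names are
+  // from that header as understood here, not from this diff):
+  //   VpxTplGopStats tpl_stats;
+  //   encoder->Control(VP9E_GET_TPL_STATS, &tpl_stats);
+  //   // ... inspect the per-frame TPL stats for the GOP ...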
#endif // CONFIG_VP9_ENCODER #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER @@ -259,7 +264,7 @@ class EncoderTest { const CodecFactory *codec_; // Hook to determine whether to decode frame after encoding - virtual bool DoDecode() const { return 1; } + virtual bool DoDecode() const { return true; } // Hook to handle encode/decode mismatch virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2); diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc index 45138f14b..6b019b2bf 100644 --- a/test/error_resilience_test.cc +++ b/test/error_resilience_test.cc @@ -30,7 +30,7 @@ class ErrorResilienceTestLarge Reset(); } - virtual ~ErrorResilienceTestLarge() {} + ~ErrorResilienceTestLarge() override = default; void Reset() { error_nframes_ = 0; @@ -38,19 +38,19 @@ class ErrorResilienceTestLarge pattern_switch_ = 0; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = 0.0; nframes_ = 0; mismatch_psnr_ = 0.0; mismatch_nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } @@ -90,7 +90,7 @@ class ErrorResilienceTestLarge return frame_flags; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video) override { frame_flags_ &= ~(VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF); // For temporal layer case. @@ -129,21 +129,21 @@ class ErrorResilienceTestLarge return 0.0; } - virtual bool DoDecode() const { + bool DoDecode() const override { if (error_nframes_ > 0 && (cfg_.g_pass == VPX_RC_LAST_PASS || cfg_.g_pass == VPX_RC_ONE_PASS)) { for (unsigned int i = 0; i < error_nframes_; ++i) { if (error_frames_[i] == nframes_ - 1) { std::cout << " Skipping decoding frame: " << error_frames_[i] << "\n"; - return 0; + return false; } } } - return 1; + return true; } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { double mismatch_psnr = compute_psnr(img1, img2); mismatch_psnr_ += mismatch_psnr; ++mismatch_nframes_; @@ -381,7 +381,7 @@ class ErrorResilienceTestLargeCodecControls Reset(); } - virtual ~ErrorResilienceTestLargeCodecControls() {} + ~ErrorResilienceTestLargeCodecControls() override = default; void Reset() { last_pts_ = 0; @@ -393,7 +393,7 @@ class ErrorResilienceTestLargeCodecControls duration_ = 0.0; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } @@ -460,8 +460,8 @@ class ErrorResilienceTestLargeCodecControls return layer_id; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (cfg_.ts_number_layers > 1) { int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); int frame_flags = SetFrameFlags(video->frame(), cfg_.ts_number_layers); @@ -476,7 +476,7 @@ class ErrorResilienceTestLargeCodecControls } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. 
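    // (e.g. a packet at pts 12 arriving after last_pts_ == 10 yields a
    // duration of 2 timebase units.)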
vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; if (duration > 1) { @@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls ++tot_frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { duration_ = (last_pts_ + 1) * timebase_; if (cfg_.ts_number_layers > 1) { for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers); diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 3bd4a1c47..7b9a836fb 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -210,13 +210,12 @@ class ExternalFrameBufferMD5Test : DecoderTest(GET_PARAM(::libvpx_test::kCodecFactoryParam)), md5_file_(nullptr), num_buffers_(0) {} - virtual ~ExternalFrameBufferMD5Test() { + ~ExternalFrameBufferMD5Test() override { if (md5_file_ != nullptr) fclose(md5_file_); } - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (num_buffers_ > 0 && video.frame_number() == 0) { // Have libvpx use frame buffers we create. ASSERT_TRUE(fb_list_.CreateBufferList(num_buffers_)); @@ -232,8 +231,8 @@ class ExternalFrameBufferMD5Test << "Md5 file open failed. Filename: " << md5_file_name_; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; @@ -289,7 +288,7 @@ class ExternalFrameBufferTest : public ::testing::Test { ExternalFrameBufferTest() : video_(nullptr), decoder_(nullptr), num_buffers_(0) {} - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9TestFile); ASSERT_NE(video_, nullptr); video_->Init(); @@ -300,7 +299,7 @@ class ExternalFrameBufferTest : public ::testing::Test { ASSERT_NE(decoder_, nullptr); } - virtual void TearDown() { + void TearDown() override { delete decoder_; decoder_ = nullptr; delete video_; @@ -355,7 +354,7 @@ class ExternalFrameBufferTest : public ::testing::Test { class ExternalFrameBufferNonRefTest : public ExternalFrameBufferTest { protected: - virtual void SetUp() { + void SetUp() override { video_ = new libvpx_test::WebMVideoSource(kVP9NonRefTestFile); ASSERT_NE(video_, nullptr); video_->Init(); diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 83d1ff142..3cdf909d4 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -132,9 +132,18 @@ void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck(). +// See: +// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. 
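+// The pragma pair scopes the workaround: optimization is turned off here and
+// restored by the matching #pragma optimize("", on) after the class body, so
+// only this test base is built unoptimized.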
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif class FwdTrans8x8TestBase { public: - virtual ~FwdTrans8x8TestBase() {} + virtual ~FwdTrans8x8TestBase() = default; protected: virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0; @@ -170,7 +179,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff255; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-255, 255] at index " << j @@ -201,7 +210,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < 64; ++j) { const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]); const int max_diff = kSignBiasMaxDiff15; - EXPECT_LT(diff, max_diff << (bit_depth_ - 8)) + ASSERT_LT(diff, max_diff << (bit_depth_ - 8)) << "Error: 8x8 FDCT/FHT has a sign bias > " << 1. * max_diff / count_test_block * 100 << "%" << " for input range [-15, 15] at index " << j @@ -275,11 +284,11 @@ class FwdTrans8x8TestBase { } } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual" << " roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip " << "error > 1/5 per block"; } @@ -360,17 +369,17 @@ class FwdTrans8x8TestBase { total_coeff_error += abs(coeff_diff); } - EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error) + ASSERT_GE(1 << 2 * (bit_depth_ - 8), max_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has" - << "an individual roundtrip error > 1"; + << " an individual roundtrip error > 1"; - EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) + ASSERT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error) << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average" << " roundtrip error > 1/5 per block"; - EXPECT_EQ(0, total_coeff_error) + ASSERT_EQ(0, total_coeff_error) << "Error: Extremal 8x8 FDCT/FHT has" - << "overflow issues in the intermediate steps > 1"; + << " overflow issues in the intermediate steps > 1"; } } @@ -426,7 +435,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - src[j]; #endif const uint32_t error = diff * diff; - EXPECT_GE(1u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(1u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } @@ -456,7 +465,7 @@ class FwdTrans8x8TestBase { for (int j = 0; j < kNumCoeffs; ++j) { const int32_t diff = coeff[j] - coeff_r[j]; const uint32_t error = diff * diff; - EXPECT_GE(9u << 2 * (bit_depth_ - 8), error) + ASSERT_GE(9u << 2 * (bit_depth_ - 8), error) << "Error: 8x8 DCT has error " << error << " at index " << j; } } @@ -512,7 +521,7 @@ class FwdTrans8x8TestBase { const int diff = dst[j] - ref[j]; #endif const uint32_t error = diff * diff; - EXPECT_EQ(0u, error) + ASSERT_EQ(0u, error) << "Error: 8x8 IDCT has error " << error << " at index " << j; } } @@ -523,13 +532,16 @@ class FwdTrans8x8TestBase { vpx_bit_depth_t bit_depth_; int mask_; }; +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public 
::testing::TestWithParam<Dct8x8Param> { public: - virtual ~FwdTrans8x8DCT() {} + ~FwdTrans8x8DCT() override = default; - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -539,13 +551,13 @@ class FwdTrans8x8DCT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } @@ -566,9 +578,9 @@ TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); } class FwdTrans8x8HT : public FwdTrans8x8TestBase, public ::testing::TestWithParam<Ht8x8Param> { public: - virtual ~FwdTrans8x8HT() {} + ~FwdTrans8x8HT() override = default; - virtual void SetUp() { + void SetUp() override { fwd_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); tx_type_ = GET_PARAM(2); @@ -578,13 +590,13 @@ class FwdTrans8x8HT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) { + void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) override { fwd_txfm_(in, out, stride, tx_type_); } - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride, tx_type_); } @@ -602,9 +614,9 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) { RunExtremalCheck(); } class InvTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam<Idct8x8Param> { public: - virtual ~InvTrans8x8DCT() {} + ~InvTrans8x8DCT() override = default; - virtual void SetUp() { + void SetUp() override { ref_txfm_ = GET_PARAM(0); inv_txfm_ = GET_PARAM(1); thresh_ = GET_PARAM(2); @@ -613,13 +625,14 @@ class InvTrans8x8DCT : public FwdTrans8x8TestBase, mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) { + void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) override { inv_txfm_(out, dst, stride); } - void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, int /*stride*/) {} + void RunFwdTxfm(int16_t * /*out*/, tran_low_t * /*dst*/, + int /*stride*/) override {} IdctFunc ref_txfm_; IdctFunc inv_txfm_; diff --git a/test/frame_size_tests.cc b/test/frame_size_tests.cc index 8a0eb71ba..7b6c29a88 100644 --- a/test/frame_size_tests.cc +++ b/test/frame_size_tests.cc @@ -65,7 +65,7 @@ class EncoderWithExpectedError : public ::libvpx_test::Encoder { ASSERT_EQ(expected_err, res) << EncoderError(); } - virtual vpx_codec_iface_t *CodecInterface() const { + vpx_codec_iface_t *CodecInterface() const override { #if CONFIG_VP9_ENCODER return &vpx_codec_vp9_cx_algo; #else @@ -79,22 +79,22 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest, protected: VP9FrameSizeTestsLarge() : EncoderTest(&::libvpx_test::kVP9), expected_res_(VPX_CODEC_OK) {} - virtual ~VP9FrameSizeTestsLarge() {} + ~VP9FrameSizeTestsLarge() override = default; - virtual 
void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { EXPECT_EQ(expected_res_, res_dec) << decoder->DecodeError(); return !::testing::Test::HasFailure(); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); diff --git a/test/hadamard_test.cc b/test/hadamard_test.cc index f904e814a..b22bae87c 100644 --- a/test/hadamard_test.cc +++ b/test/hadamard_test.cc @@ -130,13 +130,19 @@ std::ostream &operator<<(std::ostream &os, const HadamardFuncWithSize &hfs) { class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> { public: - virtual void SetUp() { + void SetUp() override { h_func_ = GetParam().func; bwh_ = GetParam().block_size; block_size_ = bwh_ * bwh_; rnd_.Reset(ACMRandom::DeterministicSeed()); } + // The Rand() function generates values in the range [-((1 << BitDepth) - 1), + // (1 << BitDepth) - 1]. This is because the input to the Hadamard transform + // is the residual pixel, which is defined as 'source pixel - predicted + // pixel'. Source pixel and predicted pixel take values in the range + // [0, (1 << BitDepth) - 1] and thus the residual pixel ranges from + // -((1 << BitDepth) - 1) to ((1 << BitDepth) - 1). virtual int16_t Rand() = 0; void ReferenceHadamard(const int16_t *a, int a_stride, tran_low_t *b, @@ -170,6 +176,31 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> { EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); } + void ExtremeValuesTest() { + const int kMaxBlockSize = 32 * 32; + DECLARE_ALIGNED(16, int16_t, input_extreme_block[kMaxBlockSize]); + DECLARE_ALIGNED(16, tran_low_t, b[kMaxBlockSize]); + memset(b, 0, sizeof(b)); + + tran_low_t b_ref[kMaxBlockSize]; + memset(b_ref, 0, sizeof(b_ref)); + + for (int i = 0; i < 2; ++i) { + // Initialize a test block with input range [-255, 255]. + const int sign = (i == 0) ? 1 : -1; + for (int j = 0; j < kMaxBlockSize; ++j) + input_extreme_block[j] = sign * 255; + + ReferenceHadamard(input_extreme_block, bwh_, b_ref, bwh_); + ASM_REGISTER_STATE_CHECK(h_func_(input_extreme_block, bwh_, b)); + + // The order of the output is not important. Sort before checking.
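+      // (Each pass saturates every residual to one extreme, first +255 and
+      // then -255; implementations may emit coefficients in different orders,
+      // hence the sorted memcmp that follows.)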
+ std::sort(b, b + block_size_); + std::sort(b_ref, b_ref + block_size_); + EXPECT_EQ(0, memcmp(b, b_ref, sizeof(b))); + } + } + void VaryStride() { const int kMaxBlockSize = 32 * 32; DECLARE_ALIGNED(16, int16_t, a[kMaxBlockSize * 8]); @@ -220,11 +251,18 @@ class HadamardTestBase : public ::testing::TestWithParam<HadamardFuncWithSize> { class HadamardLowbdTest : public HadamardTestBase { protected: - virtual int16_t Rand() { return rnd_.Rand9Signed(); } + // Use values between -255 (0xFF01) and 255 (0x00FF) + int16_t Rand() override { + int16_t src = rnd_.Rand8(); + int16_t pred = rnd_.Rand8(); + return src - pred; + } }; TEST_P(HadamardLowbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } +TEST_P(HadamardLowbdTest, ExtremeValuesTest) { ExtremeValuesTest(); } + TEST_P(HadamardLowbdTest, VaryStride) { VaryStride(); } TEST_P(HadamardLowbdTest, DISABLED_Speed) { @@ -296,7 +334,12 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH class HadamardHighbdTest : public HadamardTestBase { protected: - virtual int16_t Rand() { return rnd_.Rand13Signed(); } + // Use values between -4095 (0xF001) and 4095 (0x0FFF) + int16_t Rand() override { + int16_t src = rnd_.Rand12(); + int16_t pred = rnd_.Rand12(); + return src - pred; + } }; TEST_P(HadamardHighbdTest, CompareReferenceRandom) { CompareReferenceRandom(); } @@ -324,5 +367,14 @@ INSTANTIATE_TEST_SUITE_P( 32))); #endif // HAVE_AVX2 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P( + NEON, HadamardHighbdTest, + ::testing::Values(HadamardFuncWithSize(&vpx_highbd_hadamard_8x8_neon, 8), + HadamardFuncWithSize(&vpx_highbd_hadamard_16x16_neon, 16), + HadamardFuncWithSize(&vpx_highbd_hadamard_32x32_neon, + 32))); +#endif + #endif // CONFIG_VP9_HIGHBITDEPTH } // namespace diff --git a/test/idct_test.cc b/test/idct_test.cc index 1b9532e1c..279e58e2a 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -27,7 +27,7 @@ using libvpx_test::Buffer; class IDCTTest : public ::testing::TestWithParam<IdctFunc> { protected: - virtual void SetUp() { + void SetUp() override { UUT = GetParam(); input = new Buffer<int16_t>(4, 4, 0); @@ -41,7 +41,7 @@ class IDCTTest : public ::testing::TestWithParam<IdctFunc> { ASSERT_TRUE(output->Init()); } - virtual void TearDown() { + void TearDown() override { delete input; delete predict; delete output; diff --git a/test/init_vpx_test.cc b/test/init_vpx_test.cc new file mode 100644 index 000000000..f66f00b5c --- /dev/null +++ b/test/init_vpx_test.cc @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "test/init_vpx_test.h" + +#include "./vpx_config.h" + +#if !CONFIG_SHARED +#include <string> +#include "third_party/googletest/src/include/gtest/gtest.h" +#if VPX_ARCH_ARM +#include "vpx_ports/arm.h" +#endif +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 +#include "vpx_ports/x86.h" +#endif +extern "C" { +#if CONFIG_VP8 +extern void vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 +extern void vp9_rtcd(); +#endif // CONFIG_VP9 +extern void vpx_dsp_rtcd(); +extern void vpx_scale_rtcd(); +} + +#if VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +static void append_negative_gtest_filter(const char *str) { + std::string filter = GTEST_FLAG_GET(filter); + // Negative patterns begin with one '-' followed by a ':' separated list. + if (filter.find('-') == std::string::npos) filter += '-'; + filter += str; + GTEST_FLAG_SET(filter, filter); +} +#endif // VPX_ARCH_ARM || VPX_ARCH_X86 || VPX_ARCH_X86_64 +#endif // !CONFIG_SHARED + +namespace libvpx_test { +void init_vpx_test() { +#if !CONFIG_SHARED +#if VPX_ARCH_AARCH64 + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON_DOTPROD)) { + append_negative_gtest_filter(":NEON_DOTPROD.*:NEON_DOTPROD/*"); + } + if (!(caps & HAS_NEON_I8MM)) { + append_negative_gtest_filter(":NEON_I8MM.*:NEON_I8MM/*"); + } + if (!(caps & HAS_SVE)) { + append_negative_gtest_filter(":SVE.*:SVE/*"); + } +#elif VPX_ARCH_ARM + const int caps = arm_cpu_caps(); + if (!(caps & HAS_NEON)) append_negative_gtest_filter(":NEON.*:NEON/*"); +#endif // VPX_ARCH_ARM + +#if VPX_ARCH_X86 || VPX_ARCH_X86_64 + const int simd_caps = x86_simd_caps(); + if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); + if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); + if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); + if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); + if (!(simd_caps & HAS_SSSE3)) { + append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); + } + if (!(simd_caps & HAS_SSE4_1)) { + append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); + } + if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); + if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); + if (!(simd_caps & HAS_AVX512)) { + append_negative_gtest_filter(":AVX512.*:AVX512/*"); + } +#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 + + // Shared library builds don't support whitebox tests that exercise internal + // symbols. +#if CONFIG_VP8 + vp8_rtcd(); +#endif // CONFIG_VP8 +#if CONFIG_VP9 + vp9_rtcd(); +#endif // CONFIG_VP9 + vpx_dsp_rtcd(); + vpx_scale_rtcd(); +#endif // !CONFIG_SHARED +} +} // namespace libvpx_test diff --git a/test/init_vpx_test.h b/test/init_vpx_test.h new file mode 100644 index 000000000..5e0dbb0e7 --- /dev/null +++ b/test/init_vpx_test.h @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef TEST_INIT_VPX_TEST_H_ +#define TEST_INIT_VPX_TEST_H_ + +namespace libvpx_test { +void init_vpx_test(); +} + +#endif // TEST_INIT_VPX_TEST_H_ diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc index 762d585f5..c37dc0d48 100644 --- a/test/invalid_file_test.cc +++ b/test/invalid_file_test.cc @@ -40,7 +40,7 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, protected: InvalidFileTest() : DecoderTest(GET_PARAM(0)), res_file_(nullptr) {} - virtual ~InvalidFileTest() { + ~InvalidFileTest() override { if (res_file_ != nullptr) fclose(res_file_); } @@ -50,10 +50,9 @@ class InvalidFileTest : public ::libvpx_test::DecoderTest, << "Result file open failed. Filename: " << res_file_name_; } - virtual bool HandleDecodeResult( - const vpx_codec_err_t res_dec, - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { EXPECT_NE(res_file_, nullptr); int expected_res_dec; @@ -172,9 +171,9 @@ VP9_INSTANTIATE_TEST_SUITE(InvalidFileTest, class InvalidFileInvalidPeekTest : public InvalidFileTest { protected: InvalidFileInvalidPeekTest() : InvalidFileTest() {} - virtual void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, - libvpx_test::CompressedVideoSource * /*video*/, - const vpx_codec_err_t /*res_peek*/) {} + void HandlePeekResult(libvpx_test::Decoder *const /*decoder*/, + libvpx_test::CompressedVideoSource * /*video*/, + const vpx_codec_err_t /*res_peek*/) override {} }; TEST_P(InvalidFileInvalidPeekTest, ReturnCode) { RunTest(); } diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index a8ac4f154..3ccac62b5 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -33,19 +33,19 @@ class IVFVideoSource : public CompressedVideoSource { compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~IVFVideoSource() { + ~IVFVideoSource() override { delete[] compressed_frame_buf_; if (input_file_) fclose(input_file_); } - virtual void Init() { + void Init() override { // Allocate a buffer for read in the compressed video frame. compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize]; ASSERT_NE(compressed_frame_buf_, nullptr) << "Allocate frame buffer failed"; } - virtual void Begin() { + void Begin() override { input_file_ = OpenTestDataFile(file_name_); ASSERT_NE(input_file_, nullptr) << "Input file open failed. Filename: " << file_name_; @@ -62,7 +62,7 @@ class IVFVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } @@ -86,11 +86,11 @@ class IVFVideoSource : public CompressedVideoSource { } } - virtual const uint8_t *cxdata() const { + const uint8_t *cxdata() const override { return end_of_file_ ? nullptr : compressed_frame_buf_; } - virtual size_t frame_size() const { return frame_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return frame_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; diff --git a/test/keyframe_test.cc b/test/keyframe_test.cc index a13dec9ce..5292bb188 100644 --- a/test/keyframe_test.cc +++ b/test/keyframe_test.cc @@ -8,12 +8,18 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ #include <climits> +#include <cstring> #include <vector> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "./vpx_config.h" +#include "vpx/vp8cx.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_encoder.h" +#include "vpx/vpx_image.h" namespace { @@ -22,9 +28,9 @@ class KeyframeTest public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: KeyframeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~KeyframeTest() {} + ~KeyframeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); kf_count_ = 0; @@ -33,8 +39,8 @@ class KeyframeTest set_cpu_used_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (kf_do_force_kf_) { frame_flags_ = (video->frame() % 3) ? 0 : VPX_EFLAG_FORCE_KF; } @@ -43,7 +49,7 @@ class KeyframeTest } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.frame.flags & VPX_FRAME_IS_KEY) { kf_pts_list_.push_back(pkt->data.frame.pts); kf_count_++; @@ -146,4 +152,105 @@ TEST_P(KeyframeTest, TestAutoKeyframe) { } VP8_INSTANTIATE_TEST_SUITE(KeyframeTest, ALL_TEST_MODES); + +bool IsVP9(vpx_codec_iface_t *iface) { + static const char kVP9Name[] = "WebM Project VP9"; + return strncmp(kVP9Name, vpx_codec_iface_name(iface), sizeof(kVP9Name) - 1) == + 0; +} + +vpx_image_t *CreateGrayImage(vpx_img_fmt_t fmt, unsigned int w, + unsigned int h) { + vpx_image_t *const image = vpx_img_alloc(nullptr, fmt, w, h, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + return image; +} + +// Tests kf_max_dist in one-pass encoding with zero lag. +void TestKeyframeMaximumInterval(vpx_codec_iface_t *iface, + unsigned long deadline, + unsigned int kf_max_dist) { + vpx_codec_enc_cfg_t cfg; + ASSERT_EQ(vpx_codec_enc_config_default(iface, &cfg, /*usage=*/0), + VPX_CODEC_OK); + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_pass = VPX_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = VPX_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = kf_max_dist; + + vpx_codec_ctx_t enc; + ASSERT_EQ(vpx_codec_enc_init(&enc, iface, &cfg, 0), VPX_CODEC_OK); + + const int speed = IsVP9(iface) ? 9 : -12; + ASSERT_EQ(vpx_codec_control(&enc, VP8E_SET_CPUUSED, speed), VPX_CODEC_OK); + + vpx_image_t *image = CreateGrayImage(VPX_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frames. + const vpx_codec_cx_pkt_t *pkt; + const unsigned int num_frames = kf_max_dist == 0 ? 
4 : 3 * kf_max_dist + 1; + for (unsigned int i = 0; i < num_frames; ++i) { + ASSERT_EQ(vpx_codec_encode(&enc, image, i, 1, 0, deadline), VPX_CODEC_OK); + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + if (kf_max_dist == 0 || i % kf_max_dist == 0) { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, VPX_FRAME_IS_KEY); + } else { + ASSERT_EQ(pkt->data.frame.flags & VPX_FRAME_IS_KEY, 0u); + } + } + } + + // Flush the encoder. + bool got_data; + do { + ASSERT_EQ(vpx_codec_encode(&enc, nullptr, 0, 1, 0, deadline), VPX_CODEC_OK); + got_data = false; + vpx_codec_iter_t iter = nullptr; + while ((pkt = vpx_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, VPX_CODEC_CX_FRAME_PKT); + got_data = true; + } + } while (got_data); + + vpx_img_free(image); + ASSERT_EQ(vpx_codec_destroy(&enc), VPX_CODEC_OK); +} + +TEST(KeyframeIntervalTest, KeyframeMaximumInterval) { + std::vector<vpx_codec_iface_t *> ifaces; +#if CONFIG_VP8_ENCODER + ifaces.push_back(vpx_codec_vp8_cx()); +#endif +#if CONFIG_VP9_ENCODER + ifaces.push_back(vpx_codec_vp9_cx()); +#endif + for (vpx_codec_iface_t *iface : ifaces) { + for (unsigned long deadline : + { VPX_DL_REALTIME, VPX_DL_GOOD_QUALITY, VPX_DL_BEST_QUALITY }) { + // Test 0 and 1 (both mean all intra), some powers of 2, some multiples + // of 10, and some prime numbers. + for (unsigned int kf_max_dist : + { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) { + TestKeyframeMaximumInterval(iface, deadline, kf_max_dist); + } + } + } +} + } // namespace diff --git a/test/level_test.cc b/test/level_test.cc index 038d75f44..36cfd645c 100644 --- a/test/level_test.cc +++ b/test/level_test.cc @@ -22,9 +22,9 @@ class LevelTest : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), min_gf_internal_(24), target_level_(0), level_(0) {} - virtual ~LevelTest() {} + ~LevelTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -41,8 +41,8 @@ class LevelTest cfg_.rc_min_quantizer = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_SET_TARGET_LEVEL, target_level_); @@ -120,7 +120,7 @@ TEST_P(LevelTest, TestTargetLevel255) { TEST_P(LevelTest, TestTargetLevelApi) { ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0, 1); - static const vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; + static vpx_codec_iface_t *codec = &vpx_codec_vp9_cx_algo; vpx_codec_ctx_t enc; vpx_codec_enc_cfg_t cfg; EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(codec, &cfg, 0)); diff --git a/test/lpf_test.cc b/test/lpf_test.cc index 4cc99a6db..ce0ddeae1 100644 --- a/test/lpf_test.cc +++ b/test/lpf_test.cc @@ -129,15 +129,15 @@ uint8_t GetHevThresh(ACMRandom *rnd) { class Loop8Test6Param : public ::testing::TestWithParam<loop8_param_t> { public: - virtual ~Loop8Test6Param() {} - virtual void SetUp() { + ~Loop8Test6Param() override = default; + void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() 
override { libvpx_test::ClearSystemState(); } protected: int bit_depth_; @@ -151,15 +151,15 @@ GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param); (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH) class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> { public: - virtual ~Loop8Test9Param() {} - virtual void SetUp() { + ~Loop8Test9Param() override = default; + void SetUp() override { loopfilter_op_ = GET_PARAM(0); ref_loopfilter_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); mask_ = (1 << bit_depth_) - 1; } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int bit_depth_; diff --git a/test/minmax_test.cc b/test/minmax_test.cc index 12327bc18..b49570906 100644 --- a/test/minmax_test.cc +++ b/test/minmax_test.cc @@ -15,6 +15,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vpx_mem/vpx_mem.h" #include "test/acm_random.h" #include "test/register_state_check.h" @@ -28,7 +29,7 @@ typedef void (*MinMaxFunc)(const uint8_t *a, int a_stride, const uint8_t *b, class MinMaxTest : public ::testing::TestWithParam<MinMaxFunc> { public: - virtual void SetUp() { + void SetUp() override { mm_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } @@ -115,7 +116,115 @@ TEST_P(MinMaxTest, CompareReferenceAndVaryStride) { } } +#if CONFIG_VP9_HIGHBITDEPTH + +using HBDMinMaxTest = MinMaxTest; + +void highbd_reference_minmax(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int *min_ret, int *max_ret) { + int min = 65535; + int max = 0; + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b); + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j++) { + const int diff = abs(a_ptr[i * a_stride + j] - b_ptr[i * b_stride + j]); + if (min > diff) min = diff; + if (max < diff) max = diff; + } + } + + *min_ret = min; + *max_ret = max; +} + +TEST_P(HBDMinMaxTest, MinValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 65535, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a minimum difference of i. + + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(65535, max); + EXPECT_EQ(i, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, MaxValue) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + for (int i = 0; i < 64; i++) { + vpx_memset16(CONVERT_TO_SHORTPTR(a), 0, 64); + vpx_memset16(CONVERT_TO_SHORTPTR(b), 0, 64); + CONVERT_TO_SHORTPTR(b)[i] = i; // Set a maximum difference of i.
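+ // All other positions are equal, so the expected maximum is i and the + // expected minimum is 0.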
+ + int min, max; + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + EXPECT_EQ(i, max); + EXPECT_EQ(0, min); + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} + +TEST_P(HBDMinMaxTest, CompareReference) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc(64 * sizeof(uint16_t)))); + for (int j = 0; j < 64; j++) { + CONVERT_TO_SHORTPTR(a)[j] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[j] = rnd_.Rand16(); + } + + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, 8, b, 8, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, 8, b, 8, &min, &max)); + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); + EXPECT_EQ(max_ref, max); + EXPECT_EQ(min_ref, min); +} + +TEST_P(HBDMinMaxTest, CompareReferenceAndVaryStride) { + uint8_t *a = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + uint8_t *b = CONVERT_TO_BYTEPTR( + reinterpret_cast<uint16_t *>(vpx_malloc((8 * 64) * sizeof(uint16_t)))); + for (int i = 0; i < 8 * 64; i++) { + CONVERT_TO_SHORTPTR(a)[i] = rnd_.Rand16(); + CONVERT_TO_SHORTPTR(b)[i] = rnd_.Rand16(); + } + for (int a_stride = 8; a_stride <= 64; a_stride += 8) { + for (int b_stride = 8; b_stride <= 64; b_stride += 8) { + int min_ref, max_ref, min, max; + highbd_reference_minmax(a, a_stride, b, b_stride, &min_ref, &max_ref); + ASM_REGISTER_STATE_CHECK(mm_func_(a, a_stride, b, b_stride, &min, &max)); + EXPECT_EQ(max_ref, max) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + EXPECT_EQ(min_ref, min) + << "when a_stride = " << a_stride << " and b_stride = " << b_stride; + } + } + vpx_free(CONVERT_TO_SHORTPTR(a)); + vpx_free(CONVERT_TO_SHORTPTR(b)); +} +#endif + INSTANTIATE_TEST_SUITE_P(C, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_c)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(C, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_c)); +#endif #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, @@ -125,6 +234,10 @@ INSTANTIATE_TEST_SUITE_P(SSE2, MinMaxTest, #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, MinMaxTest, ::testing::Values(&vpx_minmax_8x8_neon)); +#if CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P(NEON, HBDMinMaxTest, + ::testing::Values(&vpx_highbd_minmax_8x8_neon)); +#endif #endif #if HAVE_MSA diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index 7eb888a58..01e63eb69 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -59,8 +59,8 @@ const int kCountTestBlock = 1000; class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { public: - virtual ~PartialIDctTest() {} - virtual void SetUp() { + ~PartialIDctTest() override = default; + void SetUp() override { rnd_.Reset(ACMRandom::DeterministicSeed()); fwd_txfm_ = GET_PARAM(0); full_inv_txfm_ = GET_PARAM(1); @@ -76,7 +76,7 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { case TX_8X8: size_ = 8; break; case TX_16X16: size_ = 16; break; case TX_32X32: size_ = 32; break; - default: FAIL() << "Wrong Size!"; break; + default: FAIL() << "Wrong Size!"; } // Randomize stride_ to a value less than or equal to 1024 @@ -100,7 +100,7 @@ class PartialIDctTest : public ::testing::TestWithParam<PartialInvTxfmParam> { vpx_memalign(16, pixel_size_ * output_block_size_)); } - virtual void TearDown() { + void TearDown() override { vpx_free(input_block_); input_block_ 
= nullptr; vpx_free(output_block_); diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index 27d5ffa90..d2db8a7c7 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -51,10 +51,10 @@ class VpxPostProcDownAndAcrossMbRowTest public: VpxPostProcDownAndAcrossMbRowTest() : mb_post_proc_down_and_across_(GetParam()) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; const VpxPostProcDownAndAcrossMbRowFunc mb_post_proc_down_and_across_; // Size of the underlying data block that will be filtered. @@ -227,10 +227,10 @@ class VpxMbPostProcAcrossIpTest VpxMbPostProcAcrossIpTest() : rows_(16), cols_(16), mb_post_proc_across_ip_(GetParam()), src_(Buffer<uint8_t>(rows_, cols_, 8, 8, 17, 8)) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; void SetCols(unsigned char *s, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { @@ -356,10 +356,10 @@ class VpxMbPostProcDownTest : rows_(16), cols_(16), mb_post_proc_down_(GetParam()), src_c_(Buffer<uint8_t>(rows_, cols_, 8, 8, 8, 17)) {} - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run(); + void Run() override; void SetRows(unsigned char *src_c, int rows, int cols, int src_width) { for (int r = 0; r < rows; r++) { diff --git a/test/predict_test.cc b/test/predict_test.cc index 747297057..474eab2cb 100644 --- a/test/predict_test.cc +++ b/test/predict_test.cc @@ -43,7 +43,7 @@ class PredictTestBase : public AbstractBench, : width_(GET_PARAM(0)), height_(GET_PARAM(1)), predict_(GET_PARAM(2)), src_(nullptr), padded_dst_(nullptr), dst_(nullptr), dst_c_(nullptr) {} - virtual void SetUp() { + void SetUp() override { src_ = new uint8_t[kSrcSize]; ASSERT_NE(src_, nullptr); @@ -64,7 +64,7 @@ class PredictTestBase : public AbstractBench, memset(dst_c_, 0, 16 * 16); } - virtual void TearDown() { + void TearDown() override { delete[] src_; src_ = nullptr; vpx_free(padded_dst_); @@ -209,7 +209,7 @@ class PredictTestBase : public AbstractBench, } } - void Run() { + void Run() override { for (int xoffset = 0; xoffset < 8; ++xoffset) { for (int yoffset = 0; yoffset < 8; ++yoffset) { if (xoffset == 0 && yoffset == 0) { @@ -350,6 +350,14 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(4, 4, &vp8_sixtap_predict4x4_mmi))); #endif +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, SixtapPredictTest, + ::testing::Values(make_tuple(16, 16, &vp8_sixtap_predict16x16_lsx), + make_tuple(8, 8, &vp8_sixtap_predict8x8_lsx), + make_tuple(4, 4, &vp8_sixtap_predict4x4_lsx))); +#endif + class BilinearPredictTest : public PredictTestBase {}; TEST_P(BilinearPredictTest, TestWithRandomData) { diff --git a/test/quantize_test.cc b/test/quantize_test.cc index 57309e810..ab38f5c1b 100644 --- a/test/quantize_test.cc +++ b/test/quantize_test.cc @@ -121,13 +121,13 @@ class QuantizeTest : public QuantizeTestBase, public ::testing::TestWithParam<VP8QuantizeParam>, public AbstractBench { protected: - virtual void SetUp() { + void SetUp() override { SetupCompressor(); asm_quant_ = GET_PARAM(0); c_quant_ = GET_PARAM(1); } - virtual void Run() { + void Run() override { asm_quant_(&vp8_comp_->mb.block[0], ¯oblockd_dst_->block[0]); } diff --git a/test/realtime_test.cc b/test/realtime_test.cc index 
c5de2dcb3..a9870b3cb 100644 --- a/test/realtime_test.cc +++ b/test/realtime_test.cc @@ -26,7 +26,7 @@ class RealtimeTest public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: RealtimeTest() : EncoderTest(GET_PARAM(0)), frame_packets_(0) {} - ~RealtimeTest() override {} + ~RealtimeTest() override = default; void SetUp() override { InitializeConfig(); @@ -95,7 +95,7 @@ TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); } TEST_P(RealtimeTest, IntegerOverflowLarge) { if (IsVP9()) { -#if VPX_ARCH_X86_64 +#if VPX_ARCH_AARCH64 || VPX_ARCH_X86_64 TestIntegerOverflow(16384, 16384); #else TestIntegerOverflow(4096, 4096); diff --git a/test/register_state_check.h b/test/register_state_check.h index 0b837dd04..ede86ef52 100644 --- a/test/register_state_check.h +++ b/test/register_state_check.h @@ -184,13 +184,13 @@ class RegisterStateCheckMMX { uint16_t pre_fpu_env_[14]; }; -#define API_REGISTER_STATE_CHECK(statement) \ - do { \ - { \ - libvpx_test::RegisterStateCheckMMX reg_check; \ - ASM_REGISTER_STATE_CHECK(statement); \ - } \ - __asm__ volatile("" ::: "memory"); \ +#define API_REGISTER_STATE_CHECK(statement) \ + do { \ + { \ + libvpx_test::RegisterStateCheckMMX reg_check_mmx; \ + ASM_REGISTER_STATE_CHECK(statement); \ + } \ + __asm__ volatile("" ::: "memory"); \ } while (false) } // namespace libvpx_test diff --git a/test/resize_test.cc b/test/resize_test.cc index d9420a454..20ad2229b 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -244,10 +244,10 @@ class ResizingVideoSource : public ::libvpx_test::DummyVideoSource { } bool flag_codec_; bool smaller_width_larger_size_; - virtual ~ResizingVideoSource() {} + ~ResizingVideoSource() override = default; protected: - virtual void Next() { + void Next() override { ++frame_; unsigned int width = 0; unsigned int height = 0; @@ -264,14 +264,14 @@ class ResizeTest protected: ResizeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ResizeTest() {} + ~ResizeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0); ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0); encode_frame_width_.push_back(pkt->data.frame.width[0]); @@ -286,8 +286,8 @@ class ResizeTest return encode_frame_height_[idx]; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t pts) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } @@ -333,15 +333,15 @@ class ResizeInternalTest : public ResizeTest { ResizeInternalTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - virtual ~ResizeInternalTest() {} + ~ResizeInternalTest() override = default; - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM outfile_ = fopen("vp90-2-05-resize.ivf", "wb"); #endif } - virtual void EndPassHook() { + void EndPassHook() override { #if WRITE_COMPRESSED_STREAM if (outfile_) { if (!fseek(outfile_, 0, SEEK_SET)) @@ -352,8 +352,8 @@ class ResizeInternalTest : public ResizeTest { #endif } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if 
(change_config_) { int new_q = 60; if (video->frame() == 0) { @@ -378,13 +378,13 @@ class ResizeInternalTest : public ResizeTest { } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame0_psnr_ == 0.) frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -447,10 +447,10 @@ class ResizeRealtimeTest public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> { protected: ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~ResizeRealtimeTest() {} + ~ResizeRealtimeTest() override = default; - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_AQ_MODE, 3); encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); @@ -463,24 +463,24 @@ class ResizeRealtimeTest } } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t pts) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t pts) override { frame_info_list_.push_back(FrameInfo(pts, img.d_w, img.d_h)); } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { double mismatch_psnr = compute_psnr(img1, img2); mismatch_psnr_ += mismatch_psnr; ++mismatch_nframes_; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ASSERT_NE(static_cast<int>(pkt->data.frame.width[0]), 0); ASSERT_NE(static_cast<int>(pkt->data.frame.height[0]), 0); encode_frame_width_.push_back(pkt->data.frame.width[0]); @@ -688,15 +688,15 @@ class ResizeCspTest : public ResizeTest { ResizeCspTest() : ResizeTest(), frame0_psnr_(0.0) {} #endif - virtual ~ResizeCspTest() {} + ~ResizeCspTest() override = default; - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { #if WRITE_COMPRESSED_STREAM outfile_ = fopen("vp91-2-05-cspchape.ivf", "wb"); #endif } - virtual void EndPassHook() { + void EndPassHook() override { #if WRITE_COMPRESSED_STREAM if (outfile_) { if (!fseek(outfile_, 0, SEEK_SET)) @@ -707,8 +707,8 @@ class ResizeCspTest : public ResizeTest { #endif } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (CspForFrameNumber(video->frame()) != VPX_IMG_FMT_I420 && cfg_.g_profile != 1) { cfg_.g_profile = 1; @@ -721,13 +721,13 @@ class ResizeCspTest : public ResizeTest { } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (frame0_psnr_ == 0.) 
frame0_psnr_ = pkt->data.psnr.psnr[0]; EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } #if WRITE_COMPRESSED_STREAM - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { ++out_frames_; // Write initial file header if first frame. @@ -753,10 +753,10 @@ class ResizingCspVideoSource : public ::libvpx_test::DummyVideoSource { limit_ = 30; } - virtual ~ResizingCspVideoSource() {} + ~ResizingCspVideoSource() override = default; protected: - virtual void Next() { + void Next() override { ++frame_; SetImageFormat(CspForFrameNumber(frame_)); FillFrame(); diff --git a/test/sad_test.cc b/test/sad_test.cc index 0896c77f1..3530e6605 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -42,6 +42,10 @@ typedef unsigned int (*SadMxNFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); typedef TestParams<SadMxNFunc> SadMxNParam; +typedef unsigned int (*SadSkipMxNFunc)(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride); +typedef TestParams<SadSkipMxNFunc> SadSkipMxNParam; + typedef unsigned int (*SadMxNAvgFunc)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); @@ -52,6 +56,11 @@ typedef void (*SadMxNx4Func)(const uint8_t *src_ptr, int src_stride, unsigned int *sad_array); typedef TestParams<SadMxNx4Func> SadMxNx4Param; +typedef void (*SadSkipMxNx4Func)(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[], int ref_stride, + unsigned int *sad_array); +typedef TestParams<SadSkipMxNx4Func> SadSkipMxNx4Param; + typedef void (*SadMxNx8Func)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array); @@ -64,7 +73,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> { public: explicit SADTestBase(const ParamType ¶ms) : params_(params) {} - virtual void SetUp() { + void SetUp() override { source_data8_ = reinterpret_cast<uint8_t *>( vpx_memalign(kDataAlignment, kDataBlockSize)); reference_data8_ = reinterpret_cast<uint8_t *>( @@ -99,7 +108,7 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> { rnd_.Reset(ACMRandom::DeterministicSeed()); } - virtual void TearDown() { + void TearDown() override { vpx_free(source_data8_); source_data8_ = nullptr; vpx_free(reference_data8_); @@ -170,6 +179,34 @@ class SADTestBase : public ::testing::TestWithParam<ParamType> { return sad; } + // Sum of Absolute Differences Skip rows. Given two blocks, calculate the + // absolute difference between two pixels in the same relative location every + // other row; accumulate and double the result at the end. 
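+ // For a 16x16 block, for example, only the eight even rows are compared + // (128 differences instead of 256); doubling that half-SAD approximates + // the full SAD at roughly half the cost.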
+ uint32_t ReferenceSADSkip(int ref_offset) const { + uint32_t sad = 0; + const uint8_t *const reference8 = GetReferenceFromOffset(ref_offset); + const uint8_t *const source8 = source_data_; +#if CONFIG_VP9_HIGHBITDEPTH + const uint16_t *const reference16 = + CONVERT_TO_SHORTPTR(GetReferenceFromOffset(ref_offset)); + const uint16_t *const source16 = CONVERT_TO_SHORTPTR(source_data_); +#endif // CONFIG_VP9_HIGHBITDEPTH + for (int h = 0; h < params_.height; h += 2) { + for (int w = 0; w < params_.width; ++w) { + if (!use_high_bit_depth_) { + sad += abs(source8[h * source_stride_ + w] - + reference8[h * reference_stride_ + w]); +#if CONFIG_VP9_HIGHBITDEPTH + } else { + sad += abs(source16[h * source_stride_ + w] - + reference16[h * reference_stride_ + w]); +#endif // CONFIG_VP9_HIGHBITDEPTH + } + } + } + return sad * 2; + } + // Sum of Absolute Differences Average. Given two blocks, and a prediction // calculate the absolute difference between one pixel and average of the // corresponding and predicted pixels; accumulate. @@ -290,6 +327,32 @@ class SADx4Test : public SADTestBase<SadMxNx4Param> { } }; +class SADSkipx4Test : public SADTestBase<SadMxNx4Param> { + public: + SADSkipx4Test() : SADTestBase(GetParam()) {} + + protected: + void SADs(unsigned int *results) const { + const uint8_t *references[] = { GetReference(0), GetReference(1), + GetReference(2), GetReference(3) }; + + ASM_REGISTER_STATE_CHECK(params_.func( + source_data_, source_stride_, references, reference_stride_, results)); + } + + void CheckSADs() const { + uint32_t reference_sad; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + + SADs(exp_sad); + for (int block = 0; block < 4; ++block) { + reference_sad = ReferenceSADSkip(GetBlockRefOffset(block)); + + EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block; + } + } +}; + class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { public: SADTest() : SADTestBase(GetParam()) {} @@ -317,6 +380,33 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> { } }; +class SADSkipTest : public AbstractBench, public SADTestBase<SadMxNParam> { + public: + SADSkipTest() : SADTestBase(GetParam()) {} + + protected: + unsigned int SAD(int block_idx) const { + unsigned int ret; + const uint8_t *const reference = GetReference(block_idx); + + ASM_REGISTER_STATE_CHECK(ret = params_.func(source_data_, source_stride_, + reference, reference_stride_)); + return ret; + } + + void CheckSAD() const { + const unsigned int reference_sad = ReferenceSADSkip(GetBlockRefOffset(0)); + const unsigned int exp_sad = SAD(0); + + ASSERT_EQ(reference_sad, exp_sad); + } + + void Run() override { + params_.func(source_data_, source_stride_, reference_data_, + reference_stride_); + } +}; + class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> { public: SADavgTest() : SADTestBase(GetParam()) {} @@ -397,6 +487,58 @@ TEST_P(SADTest, DISABLED_Speed) { PrintMedian(title); } +TEST_P(SADSkipTest, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(reference_data_, reference_stride_, mask_); + CheckSAD(); +} + +TEST_P(SADSkipTest, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(reference_data_, reference_stride_, 0); + CheckSAD(); +} + +TEST_P(SADSkipTest, ShortRef) { + const int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, 
UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. + const int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, ShortSrc) { + const int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(reference_data_, reference_stride_); + CheckSAD(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipTest, DISABLED_Speed) { + const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height); + FillRandom(source_data_, source_stride_); + + RunNTimes(kCountSpeedTestBlock); + + char title[16]; + snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height); + PrintMedian(title); +} + TEST_P(SADavgTest, MaxRef) { FillConstant(source_data_, source_stride_, 0); FillConstant(reference_data_, reference_stride_, mask_); @@ -554,6 +696,105 @@ TEST_P(SADx4Test, DISABLED_Speed) { reference_stride_ = tmp_stride; } +TEST_P(SADSkipx4Test, MaxRef) { + FillConstant(source_data_, source_stride_, 0); + FillConstant(GetReference(0), reference_stride_, mask_); + FillConstant(GetReference(1), reference_stride_, mask_); + FillConstant(GetReference(2), reference_stride_, mask_); + FillConstant(GetReference(3), reference_stride_, mask_); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, MaxSrc) { + FillConstant(source_data_, source_stride_, mask_); + FillConstant(GetReference(0), reference_stride_, 0); + FillConstant(GetReference(1), reference_stride_, 0); + FillConstant(GetReference(2), reference_stride_, 0); + FillConstant(GetReference(3), reference_stride_, 0); + CheckSADs(); +} + +TEST_P(SADSkipx4Test, ShortRef) { + int tmp_stride = reference_stride_; + reference_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, UnalignedRef) { + // The reference frame, but not the source frame, may be unaligned for + // certain types of searches. 
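+ // Reducing the stride by one pixel shifts each subsequent row off the + // usual alignment, exercising any unaligned load paths.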
+ int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + reference_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, ShortSrc) { + int tmp_stride = source_stride_; + source_stride_ >>= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_stride_ = tmp_stride; +} + +TEST_P(SADSkipx4Test, SrcAlignedByWidth) { + uint8_t *tmp_source_data = source_data_; + source_data_ += params_.width; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + CheckSADs(); + source_data_ = tmp_source_data; +} + +TEST_P(SADSkipx4Test, DISABLED_Speed) { + int tmp_stride = reference_stride_; + reference_stride_ -= 1; + FillRandom(source_data_, source_stride_); + FillRandom(GetReference(0), reference_stride_); + FillRandom(GetReference(1), reference_stride_); + FillRandom(GetReference(2), reference_stride_); + FillRandom(GetReference(3), reference_stride_); + const int kCountSpeedTestBlock = 500000000 / (params_.width * params_.height); + uint32_t reference_sad[4]; + DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]); + vpx_usec_timer timer; + for (int block = 0; block < 4; ++block) { + reference_sad[block] = ReferenceSADSkip(GetBlockRefOffset(block)); + } + vpx_usec_timer_start(&timer); + for (int i = 0; i < kCountSpeedTestBlock; ++i) { + SADs(exp_sad); + } + vpx_usec_timer_mark(&timer); + for (int block = 0; block < 4; ++block) { + EXPECT_EQ(reference_sad[block], exp_sad[block]) << "block " << block; + } + const int elapsed_time = + static_cast<int>(vpx_usec_timer_elapsed(&timer) / 1000); + printf("sad%dx%dx4 (%2dbit) time: %5d ms\n", params_.width, params_.height, + bit_depth_, elapsed_time); + + reference_stride_ = tmp_stride; +} + //------------------------------------------------------------------------------ // C functions const SadMxNParam c_tests[] = { @@ -614,6 +855,56 @@ const SadMxNParam c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADTest, ::testing::ValuesIn(c_tests)); +const SadSkipMxNParam skip_c_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_c), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_c), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_c), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_c), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_c), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_c), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_c), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_c), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_c), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_c), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 8), + 
SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_c, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_c, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_c, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_c, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_c, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_c, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_c, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_c, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_c, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_c, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipTest, ::testing::ValuesIn(skip_c_tests)); + const SadMxNAvgParam avg_c_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_c), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_c), @@ -730,6 +1021,57 @@ const SadMxNx4Param x4d_c_tests[] = { }; INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); +const SadSkipMxNx4Param skip_x4d_c_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_c), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_c), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_c), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_c), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_c), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_c), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_c), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_c), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_c), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_c), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_c), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 
10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_c, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_c, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_c, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_c, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_c, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_c, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_c, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_c, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_c, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_c, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_c, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(C, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_c_tests)); + //------------------------------------------------------------------------------ // ARM functions #if HAVE_NEON @@ -787,6 +1129,95 @@ const SadMxNParam neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNParam neon_dotprod_tests[] = { + SadMxNParam(64, 64, &vpx_sad64x64_neon_dotprod), + SadMxNParam(64, 32, &vpx_sad64x32_neon_dotprod), + SadMxNParam(32, 64, &vpx_sad32x64_neon_dotprod), + SadMxNParam(32, 32, &vpx_sad32x32_neon_dotprod), + SadMxNParam(32, 16, &vpx_sad32x16_neon_dotprod), + SadMxNParam(16, 32, &vpx_sad16x32_neon_dotprod), + SadMxNParam(16, 16, &vpx_sad16x16_neon_dotprod), + SadMxNParam(16, 8, &vpx_sad16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADTest, + ::testing::ValuesIn(neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadSkipMxNParam skip_neon_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_neon), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_neon), + SadSkipMxNParam(8, 4, &vpx_sad_skip_8x4_neon), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_neon), + SadSkipMxNParam(4, 4, &vpx_sad_skip_4x4_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 8), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 8), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 8), + SadSkipMxNParam(32, 32, 
&vpx_highbd_sad_skip_32x32_neon, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 8), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 10), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 10), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 10), + SadSkipMxNParam(4, 4, &vpx_highbd_sad_skip_4x4_neon, 12), + SadSkipMxNParam(4, 8, &vpx_highbd_sad_skip_4x8_neon, 12), + SadSkipMxNParam(8, 4, &vpx_highbd_sad_skip_8x4_neon, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_neon, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_neon, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_neon, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_neon, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_neon, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_neon, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_neon, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_neon, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_neon, 12), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipTest, + ::testing::ValuesIn(skip_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadSkipMxNParam skip_neon_dotprod_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_neon_dotprod), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_neon_dotprod), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_neon_dotprod), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_neon_dotprod), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_neon_dotprod), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_neon_dotprod), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_neon_dotprod), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipTest, + ::testing::ValuesIn(skip_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadMxNAvgParam avg_neon_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon), @@ -845,6 +1276,21 @@ const SadMxNAvgParam avg_neon_tests[] = { }; INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests)); +#if HAVE_NEON_DOTPROD +const SadMxNAvgParam avg_neon_dotprod_tests[] = { + SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_neon_dotprod), + SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_neon_dotprod), + SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_neon_dotprod), + SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_neon_dotprod), + SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_neon_dotprod), + SadMxNAvgParam(16, 32, &vpx_sad16x32_avg_neon_dotprod), + SadMxNAvgParam(16, 16, &vpx_sad16x16_avg_neon_dotprod), + SadMxNAvgParam(16, 8, &vpx_sad16x8_avg_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADavgTest, 
+ ::testing::ValuesIn(avg_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + const SadMxNx4Param x4d_neon_tests[] = { SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon), SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon), @@ -899,6 +1345,92 @@ const SadMxNx4Param x4d_neon_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadMxNx4Param x4d_neon_dotprod_tests[] = { + SadMxNx4Param(64, 64, &vpx_sad64x64x4d_neon_dotprod), + SadMxNx4Param(64, 32, &vpx_sad64x32x4d_neon_dotprod), + SadMxNx4Param(32, 64, &vpx_sad32x64x4d_neon_dotprod), + SadMxNx4Param(32, 32, &vpx_sad32x32x4d_neon_dotprod), + SadMxNx4Param(32, 16, &vpx_sad32x16x4d_neon_dotprod), + SadMxNx4Param(16, 32, &vpx_sad16x32x4d_neon_dotprod), + SadMxNx4Param(16, 16, &vpx_sad16x16x4d_neon_dotprod), + SadMxNx4Param(16, 8, &vpx_sad16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADx4Test, + ::testing::ValuesIn(x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD + +const SadSkipMxNx4Param skip_x4d_neon_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_neon), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_neon), + SadSkipMxNx4Param(8, 4, &vpx_sad_skip_8x4x4d_neon), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_neon), + SadSkipMxNx4Param(4, 4, &vpx_sad_skip_4x4x4d_neon), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 8), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 8), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_neon, 10), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 10), + SadSkipMxNx4Param(4, 4, &vpx_highbd_sad_skip_4x4x4d_neon, 12), + SadSkipMxNx4Param(4, 
8, &vpx_highbd_sad_skip_4x8x4d_neon, 12), + SadSkipMxNx4Param(8, 4, &vpx_highbd_sad_skip_8x4x4d_neon, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_neon, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_neon, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_neon, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_neon, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_neon, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_neon, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_neon, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_neon, 12), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_neon, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(NEON, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_tests)); + +#if HAVE_NEON_DOTPROD +const SadSkipMxNx4Param skip_x4d_neon_dotprod_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_neon_dotprod), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_neon_dotprod), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_neon_dotprod), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_neon_dotprod), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_neon_dotprod), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_neon_dotprod_tests)); +#endif // HAVE_NEON_DOTPROD #endif // HAVE_NEON //------------------------------------------------------------------------------ @@ -956,6 +1488,54 @@ const SadMxNParam sse2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests)); +const SadSkipMxNParam skip_sse2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_sse2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_sse2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_sse2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_sse2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_sse2), + SadSkipMxNParam(16, 32, &vpx_sad_skip_16x32_sse2), + SadSkipMxNParam(16, 16, &vpx_sad_skip_16x16_sse2), + SadSkipMxNParam(16, 8, &vpx_sad_skip_16x8_sse2), + SadSkipMxNParam(8, 16, &vpx_sad_skip_8x16_sse2), + SadSkipMxNParam(8, 8, &vpx_sad_skip_8x8_sse2), + SadSkipMxNParam(4, 8, &vpx_sad_skip_4x8_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 8), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 8), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 8), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 10),
SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 10), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 10), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 10), + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_sse2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_sse2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_sse2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_sse2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_sse2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_sse2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_sse2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_sse2, 12), + SadSkipMxNParam(8, 16, &vpx_highbd_sad_skip_8x16_sse2, 12), + SadSkipMxNParam(8, 8, &vpx_highbd_sad_skip_8x8_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipTest, + ::testing::ValuesIn(skip_sse2_tests)); + const SadMxNAvgParam avg_sse2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_sse2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_sse2), @@ -1065,6 +1645,57 @@ const SadMxNx4Param x4d_sse2_tests[] = { #endif // CONFIG_VP9_HIGHBITDEPTH }; INSTANTIATE_TEST_SUITE_P(SSE2, SADx4Test, ::testing::ValuesIn(x4d_sse2_tests)); + +const SadSkipMxNx4Param skip_x4d_sse2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_sse2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_sse2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_sse2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_sse2), + SadSkipMxNx4Param(32, 16, &vpx_sad_skip_32x16x4d_sse2), + SadSkipMxNx4Param(16, 32, &vpx_sad_skip_16x32x4d_sse2), + SadSkipMxNx4Param(16, 16, &vpx_sad_skip_16x16x4d_sse2), + SadSkipMxNx4Param(16, 8, &vpx_sad_skip_16x8x4d_sse2), + SadSkipMxNx4Param(8, 16, &vpx_sad_skip_8x16x4d_sse2), + SadSkipMxNx4Param(8, 8, &vpx_sad_skip_8x8x4d_sse2), + SadSkipMxNx4Param(4, 8, &vpx_sad_skip_4x8x4d_sse2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 8), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 8), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 8), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 10), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 10), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 10), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_sse2, 12), + SadSkipMxNx4Param(64, 32, 
&vpx_highbd_sad_skip_64x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_sse2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_sse2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_sse2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_sse2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_sse2, 12), + SadSkipMxNx4Param(8, 16, &vpx_highbd_sad_skip_8x16x4d_sse2, 12), + SadSkipMxNx4Param(8, 8, &vpx_highbd_sad_skip_8x8x4d_sse2, 12), + SadSkipMxNx4Param(4, 8, &vpx_highbd_sad_skip_4x8x4d_sse2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(SSE2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_sse2_tests)); #endif // HAVE_SSE2 #if HAVE_SSE3 @@ -1113,6 +1744,44 @@ const SadMxNParam avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests)); +const SadSkipMxNParam skip_avx2_tests[] = { + SadSkipMxNParam(64, 64, &vpx_sad_skip_64x64_avx2), + SadSkipMxNParam(64, 32, &vpx_sad_skip_64x32_avx2), + SadSkipMxNParam(32, 64, &vpx_sad_skip_32x64_avx2), + SadSkipMxNParam(32, 32, &vpx_sad_skip_32x32_avx2), + SadSkipMxNParam(32, 16, &vpx_sad_skip_32x16_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 8), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 8), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 8), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 8), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 8), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 8), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 8), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 8), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 10), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 10), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 10), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 10), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 10), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 10), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 10), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 10), + + SadSkipMxNParam(64, 64, &vpx_highbd_sad_skip_64x64_avx2, 12), + SadSkipMxNParam(64, 32, &vpx_highbd_sad_skip_64x32_avx2, 12), + SadSkipMxNParam(32, 64, &vpx_highbd_sad_skip_32x64_avx2, 12), + SadSkipMxNParam(32, 32, &vpx_highbd_sad_skip_32x32_avx2, 12), + SadSkipMxNParam(32, 16, &vpx_highbd_sad_skip_32x16_avx2, 12), + SadSkipMxNParam(16, 32, &vpx_highbd_sad_skip_16x32_avx2, 12), + SadSkipMxNParam(16, 16, &vpx_highbd_sad_skip_16x16_avx2, 12), + SadSkipMxNParam(16, 8, &vpx_highbd_sad_skip_16x8_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipTest, + ::testing::ValuesIn(skip_avx2_tests)); + const SadMxNAvgParam avg_avx2_tests[] = { SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_avx2), SadMxNAvgParam(64, 32, &vpx_sad64x32_avg_avx2), @@ -1180,6 +1849,42 @@ const SadMxNx4Param x4d_avx2_tests[] = { }; INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests)); +const SadSkipMxNx4Param skip_x4d_avx2_tests[] = { + SadSkipMxNx4Param(64, 64, &vpx_sad_skip_64x64x4d_avx2), + SadSkipMxNx4Param(64, 32, &vpx_sad_skip_64x32x4d_avx2), + SadSkipMxNx4Param(32, 64, &vpx_sad_skip_32x64x4d_avx2), + SadSkipMxNx4Param(32, 32, &vpx_sad_skip_32x32x4d_avx2), + SadSkipMxNx4Param(32, 16, 
&vpx_sad_skip_32x16x4d_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 8), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 8), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 8), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 8), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 8), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 8), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 10), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 10), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 10), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 10), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 10), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 10), + SadSkipMxNx4Param(64, 64, &vpx_highbd_sad_skip_64x64x4d_avx2, 12), + SadSkipMxNx4Param(64, 32, &vpx_highbd_sad_skip_64x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 64, &vpx_highbd_sad_skip_32x64x4d_avx2, 12), + SadSkipMxNx4Param(32, 32, &vpx_highbd_sad_skip_32x32x4d_avx2, 12), + SadSkipMxNx4Param(32, 16, &vpx_highbd_sad_skip_32x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 32, &vpx_highbd_sad_skip_16x32x4d_avx2, 12), + SadSkipMxNx4Param(16, 16, &vpx_highbd_sad_skip_16x16x4d_avx2, 12), + SadSkipMxNx4Param(16, 8, &vpx_highbd_sad_skip_16x8x4d_avx2, 12), +#endif // CONFIG_VP9_HIGHBITDEPTH +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SADSkipx4Test, + ::testing::ValuesIn(skip_x4d_avx2_tests)); + #endif // HAVE_AVX2 #if HAVE_AVX512 diff --git a/test/set_roi.cc b/test/set_roi.cc index 167cf908f..693410e39 100644 --- a/test/set_roi.cc +++ b/test/set_roi.cc @@ -40,7 +40,7 @@ TEST(VP8RoiMapTest, ParameterCheck) { // Initialize elements of cpi with valid defaults. 
VP8_COMP cpi; - cpi.mb.e_mbd.mb_segement_abs_delta = SEGMENT_DELTADATA; + cpi.mb.e_mbd.mb_segment_abs_delta = SEGMENT_DELTADATA; cpi.cyclic_refresh_mode_enabled = 0; cpi.mb.e_mbd.segmentation_enabled = 0; cpi.mb.e_mbd.update_mb_segmentation_map = 0; diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index df6da8403..725d5eb85 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -21,9 +21,14 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "test/util.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vpx_ports/vpx_timer.h" using libvpx_test::ACMRandom; +using ::testing::Combine; +using ::testing::Range; +using ::testing::ValuesIn; namespace { const int kNumIterations = 10000; @@ -33,13 +38,13 @@ typedef std::tuple<SSI16Func, SSI16Func> SumSquaresParam; class SumSquaresTest : public ::testing::TestWithParam<SumSquaresParam> { public: - virtual ~SumSquaresTest() {} - virtual void SetUp() { + ~SumSquaresTest() override = default; + void SetUp() override { ref_func_ = GET_PARAM(0); tst_func_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: SSI16Func ref_func_; @@ -126,4 +131,210 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_msa))); #endif // HAVE_MSA + +typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height); + +struct TestSSEFuncs { + TestSSEFuncs(SSEFunc ref = nullptr, SSEFunc tst = nullptr, int depth = 0) + : ref_func(ref), tst_func(tst), bit_depth(depth) {} + SSEFunc ref_func; // Pointer to reference function + SSEFunc tst_func; // Pointer to tested function + int bit_depth; +}; + +typedef std::tuple<TestSSEFuncs, int> SSETestParam; + +class SSETest : public ::testing::TestWithParam<SSETestParam> { + public: + ~SSETest() override = default; + void SetUp() override { + params_ = GET_PARAM(0); + width_ = GET_PARAM(1); + is_hbd_ = +#if CONFIG_VP9_HIGHBITDEPTH + params_.ref_func == vpx_highbd_sse_c; +#else + false; +#endif + rnd_.Reset(ACMRandom::DeterministicSeed()); + src_ = reinterpret_cast<uint8_t *>(vpx_memalign(32, 256 * 256 * 2)); + ref_ = reinterpret_cast<uint8_t *>(vpx_memalign(32, 256 * 256 * 2)); + ASSERT_NE(src_, nullptr); + ASSERT_NE(ref_, nullptr); + } + + void TearDown() override { + vpx_free(src_); + vpx_free(ref_); + } + void RunTest(bool is_random, int width, int height, int run_times); + + void GenRandomData(int width, int height, int stride) { + uint16_t *src16 = reinterpret_cast<uint16_t *>(src_); + uint16_t *ref16 = reinterpret_cast<uint16_t *>(ref_); + const int msb = 11; // Up to 12 bit input + const int limit = 1 << (msb + 1); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + src_[ii * stride + jj] = rnd_.Rand8(); + ref_[ii * stride + jj] = rnd_.Rand8(); + } else { + src16[ii * stride + jj] = rnd_(limit); + ref16[ii * stride + jj] = rnd_(limit); + } + } + } + } + + void GenExtremeData(int width, int height, int stride, uint8_t *data, + int16_t val) { + uint16_t *data16 = reinterpret_cast<uint16_t *>(data); + for (int ii = 0; ii < height; ii++) { + for (int jj = 0; jj < width; jj++) { + if (!is_hbd_) { + data[ii * stride + jj] = static_cast<uint8_t>(val); + } else { + data16[ii * stride + jj] = val; + } + } + } + } + + protected: + bool is_hbd_; + int width_; + TestSSEFuncs params_; + uint8_t *src_; + 
uint8_t *ref_; + ACMRandom rnd_; +}; +GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(SSETest); + +void SSETest::RunTest(bool is_random, int width, int height, int run_times) { + int failed = 0; + vpx_usec_timer ref_timer, test_timer; + for (int k = 0; k < 3; k++) { + int stride = 4 << rnd_(7); // Up to 256 stride + while (stride < width) { // Make sure it's valid + stride = 4 << rnd_(7); + } + if (is_random) { + GenRandomData(width, height, stride); + } else { + const int msb = is_hbd_ ? 12 : 8; // Up to 12 bit input + const int limit = (1 << msb) - 1; + if (k == 0) { + GenExtremeData(width, height, stride, src_, 0); + GenExtremeData(width, height, stride, ref_, limit); + } else { + GenExtremeData(width, height, stride, src_, limit); + GenExtremeData(width, height, stride, ref_, 0); + } + } + int64_t res_ref, res_tst; + uint8_t *src = src_; + uint8_t *ref = ref_; +#if CONFIG_VP9_HIGHBITDEPTH + if (is_hbd_) { + src = CONVERT_TO_BYTEPTR(src_); + ref = CONVERT_TO_BYTEPTR(ref_); + } +#endif + res_ref = params_.ref_func(src, stride, ref, stride, width, height); + res_tst = params_.tst_func(src, stride, ref, stride, width, height); + if (run_times > 1) { + vpx_usec_timer_start(&ref_timer); + for (int j = 0; j < run_times; j++) { + params_.ref_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&ref_timer); + const int elapsed_time_c = + static_cast<int>(vpx_usec_timer_elapsed(&ref_timer)); + + vpx_usec_timer_start(&test_timer); + for (int j = 0; j < run_times; j++) { + params_.tst_func(src, stride, ref, stride, width, height); + } + vpx_usec_timer_mark(&test_timer); + const int elapsed_time_simd = + static_cast<int>(vpx_usec_timer_elapsed(&test_timer)); + + printf( + "c_time=%d \t simd_time=%d \t " + "gain=%d\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / elapsed_time_simd)); + } else { + if (!failed) { + failed = res_ref != res_tst; + EXPECT_EQ(res_ref, res_tst) + << "Error:" << (is_hbd_ ? 
"hbd " : " ") << k << " SSE Test [" + << width << "x" << height + << "] C output does not match optimized output."; + } + } + } +} + +TEST_P(SSETest, OperationCheck) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 1); // GenRandomData + } +} + +TEST_P(SSETest, ExtremeValues) { + for (int height = 4; height <= 128; height += 4) { + RunTest(false, width_, height, 1); + } +} + +TEST_P(SSETest, DISABLED_Speed) { + for (int height = 4; height <= 128; height += 4) { + RunTest(true, width_, height, 100); + } +} + +#if HAVE_NEON +TestSSEFuncs sse_neon[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_neon) +#endif +}; +INSTANTIATE_TEST_SUITE_P(NEON, SSETest, + Combine(ValuesIn(sse_neon), Range(4, 129, 4))); +#endif // HAVE_NEON + +#if HAVE_NEON_DOTPROD +TestSSEFuncs sse_neon_dotprod[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_neon_dotprod), +}; +INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, SSETest, + Combine(ValuesIn(sse_neon_dotprod), Range(4, 129, 4))); +#endif // HAVE_NEON_DOTPROD + +#if HAVE_SSE4_1 +TestSSEFuncs sse_sse4[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_sse4_1), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_sse4_1) +#endif +}; +INSTANTIATE_TEST_SUITE_P(SSE4_1, SSETest, + Combine(ValuesIn(sse_sse4), Range(4, 129, 4))); +#endif // HAVE_SSE4_1 + +#if HAVE_AVX2 + +TestSSEFuncs sse_avx2[] = { + TestSSEFuncs(&vpx_sse_c, &vpx_sse_avx2), +#if CONFIG_VP9_HIGHBITDEPTH + TestSSEFuncs(&vpx_highbd_sse_c, &vpx_highbd_sse_avx2) +#endif +}; +INSTANTIATE_TEST_SUITE_P(AVX2, SSETest, + Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); +#endif // HAVE_AVX2 } // namespace diff --git a/test/superframe_test.cc b/test/superframe_test.cc index a5c92e914..4c3aa1625 100644 --- a/test/superframe_test.cc +++ b/test/superframe_test.cc @@ -28,9 +28,9 @@ class SuperframeTest protected: SuperframeTest() : EncoderTest(GET_PARAM(0)), modified_buf_(nullptr), last_sf_pts_(0) {} - virtual ~SuperframeTest() {} + ~SuperframeTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); const SuperframeTestParam input = GET_PARAM(1); const libvpx_test::TestMode mode = std::get<kTestMode>(input); @@ -39,17 +39,17 @@ class SuperframeTest sf_count_max_ = INT_MAX; } - virtual void TearDown() { delete[] modified_buf_; } + void TearDown() override { delete[] modified_buf_; } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); } } - virtual const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( - const vpx_codec_cx_pkt_t *pkt) { + const vpx_codec_cx_pkt_t *MutateEncoderOutputHook( + const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return pkt; const uint8_t *buffer = reinterpret_cast<uint8_t *>(pkt->data.frame.buf); diff --git a/test/svc_datarate_test.cc b/test/svc_datarate_test.cc index 484252ca4..aff4ace84 100644 --- a/test/svc_datarate_test.cc +++ b/test/svc_datarate_test.cc @@ -43,7 +43,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } protected: - virtual ~DatarateOnePassCbrSvc() {} + ~DatarateOnePassCbrSvc() override = default; virtual void ResetModel() { last_pts_ = 0; @@ -86,7 +86,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } ksvc_flex_noupd_tlenh_ = false; } - virtual void 
BeginPassHook(unsigned int /*pass*/) {} + void BeginPassHook(unsigned int /*pass*/) override {} // Example pattern for spatial layers and 2 temporal layers used in the // bypass/flexible mode. The pattern corresponds to the pattern @@ -179,8 +179,8 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); if (video->frame() == 0) { @@ -256,13 +256,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config, + number_spatial_layers_, 0, &ref_frame_config_, 1); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (update_pattern_ && video->frame() >= 100) { @@ -277,13 +277,13 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { temporal_layer_id_ = layer_id.temporal_layer_id; for (int i = 0; i < number_spatial_layers_; i++) { layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_; - ref_frame_config.duration[i] = 1; + ref_frame_config_.duration[i] = 1; } encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id); set_frame_flags_bypass_mode(layer_id.temporal_layer_id, - number_spatial_layers_, 0, &ref_frame_config, + number_spatial_layers_, 0, &ref_frame_config_, 0); - encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config); + encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_); } if (change_bitrate_ && video->frame() == 200) { @@ -468,7 +468,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { return VPX_CODEC_OK; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { uint32_t sizes[8] = { 0 }; uint32_t sizes_parsed[8] = { 0 }; int count = 0; @@ -571,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void EndPassHook() { + void EndPassHook() override { if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_; duration_ = (last_pts_ + 1) * timebase_; for (int sl = 0; sl < number_spatial_layers_; ++sl) { @@ -583,7 +583,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { } } - virtual void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) { + void MismatchHook(const vpx_image_t *img1, const vpx_image_t *img2) override { // TODO(marpan): Look into why an assert is triggered in compute_psnr // for mismatch frames for the special test case: ksvc_flex_noupd_tlenh. // Has to do with dropped frames in bypass/flexible svc mode. 
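The ref_frame_config to ref_frame_config_ renames above bring the member in line with the trailing-underscore convention used by the other test fields, which also makes it harder to shadow accidentally with a local. For context, this is the bypass-mode sequence those hunks drive, condensed into one hedged sketch that uses the raw vpx_codec_control() API rather than the harness's Encoder::Control() wrapper:

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

// One frame of flexible (bypass) SVC control, mirroring the hunks above.
static void ConfigureBypassFrame(vpx_codec_ctx_t *codec, int temporal_layer_id,
                                 int num_spatial_layers) {
  vpx_svc_layer_id_t layer_id = {};
  vpx_svc_ref_frame_config_t ref_frame_config = {};
  layer_id.temporal_layer_id = temporal_layer_id;
  for (int i = 0; i < num_spatial_layers; ++i) {
    layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id;
    ref_frame_config.duration[i] = 1;  // per-spatial-layer frame duration
  }
  vpx_codec_control(codec, VP9E_SET_SVC_LAYER_ID, &layer_id);
  // The test fills the reference/update flags with its
  // set_frame_flags_bypass_mode() helper before this final call.
  vpx_codec_control(codec, VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config);
}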
@@ -611,7 +611,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { bool single_layer_resize_; unsigned int top_sl_width_; unsigned int top_sl_height_; - vpx_svc_ref_frame_config_t ref_frame_config; + vpx_svc_ref_frame_config_t ref_frame_config_; int update_pattern_; bool change_bitrate_; vpx_codec_pts_t last_pts_ref_; @@ -639,7 +639,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc { bool ksvc_flex_noupd_tlenh_; private: - virtual void SetConfig(const int num_temporal_layer) { + void SetConfig(const int num_temporal_layer) override { cfg_.rc_end_usage = VPX_CBR; cfg_.g_lag_in_frames = 0; cfg_.g_error_resilient = 1; @@ -670,10 +670,10 @@ class DatarateOnePassCbrSvcSingleBR DatarateOnePassCbrSvcSingleBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcSingleBR() {} + ~DatarateOnePassCbrSvcSingleBR() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1160,10 +1160,10 @@ class DatarateOnePassCbrSvcMultiBR DatarateOnePassCbrSvcMultiBR() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcMultiBR() {} + ~DatarateOnePassCbrSvcMultiBR() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1243,10 +1243,10 @@ class DatarateOnePassCbrSvcFrameDropMultiBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcFrameDropMultiBR() {} + ~DatarateOnePassCbrSvcFrameDropMultiBR() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1355,10 +1355,10 @@ class DatarateOnePassCbrSvcInterLayerPredSingleBR : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcInterLayerPredSingleBR() {} + ~DatarateOnePassCbrSvcInterLayerPredSingleBR() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1441,10 +1441,10 @@ class DatarateOnePassCbrSvcDenoiser DatarateOnePassCbrSvcDenoiser() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcDenoiser() {} + ~DatarateOnePassCbrSvcDenoiser() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1499,10 +1499,10 @@ class DatarateOnePassCbrSvcSmallKF DatarateOnePassCbrSvcSmallKF() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcSmallKF() {} + ~DatarateOnePassCbrSvcSmallKF() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); @@ -1702,10 +1702,10 @@ class DatarateOnePassCbrSvcPostencodeDrop DatarateOnePassCbrSvcPostencodeDrop() : DatarateOnePassCbrSvc(GET_PARAM(0)) { memset(&svc_params_, 0, sizeof(svc_params_)); } - virtual ~DatarateOnePassCbrSvcPostencodeDrop() {} + ~DatarateOnePassCbrSvcPostencodeDrop() override = default; protected: - virtual void SetUp() { + void 
SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); speed_setting_ = GET_PARAM(1); diff --git a/test/svc_end_to_end_test.cc b/test/svc_end_to_end_test.cc index 7300ce667..b4337ae75 100644 --- a/test/svc_end_to_end_test.cc +++ b/test/svc_end_to_end_test.cc @@ -45,19 +45,19 @@ class ScalePartitionOnePassCbrSvc } protected: - virtual ~ScalePartitionOnePassCbrSvc() {} + ~ScalePartitionOnePassCbrSvc() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. @@ -67,12 +67,12 @@ class ScalePartitionOnePassCbrSvc num_nonref_frames_++; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { ++mismatch_nframes_; } - virtual void SetConfig(const int /*num_temporal_layer*/) {} + void SetConfig(const int /*num_temporal_layer*/) override {} unsigned int GetMismatchFrames() const { return mismatch_nframes_; } unsigned int GetNonRefFrames() const { return num_nonref_frames_; } @@ -129,14 +129,14 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } protected: - virtual ~SyncFrameOnePassCbrSvc() {} + ~SyncFrameOnePassCbrSvc() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual bool DoDecode() const { + bool DoDecode() const override { return current_video_frame_ >= frame_to_start_decode_; } @@ -225,8 +225,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { current_video_frame_ = video->frame(); PreEncodeFrameHookSetup(video, encoder); if (video->frame() == 0) { @@ -265,8 +265,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } #if CONFIG_VP9_DECODER - virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { if (video->frame() < frame_to_sync_) { if (decode_to_layer_before_sync_ >= 0) decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, @@ -284,7 +284,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } #endif - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. 
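The SyncFrameOnePassCbrSvc overrides above gate decoding in two steps: DoDecode() skips everything before frame_to_start_decode_, and PreDecodeFrameHook() caps the decoded spatial layer until the sync frame is reached. A schematic sketch of both, assuming negative layer ids mean "decode all layers" (the post-sync branch is inferred, since the hunk is truncated before it):

#include "vpx/vp8dx.h"
#include "vpx/vpx_decoder.h"

// DoDecode() analogue: nothing is handed to the decoder before the chosen
// start frame.
static bool ShouldDecode(unsigned int frame, unsigned int frame_to_start) {
  return frame >= frame_to_start;
}

// PreDecodeFrameHook() analogue: limit the decoded spatial layer, switching
// targets at the sync frame.
static void SetDecodeLayer(vpx_codec_ctx_t *decoder, unsigned int frame,
                           unsigned int frame_to_sync, int layer_before_sync,
                           int layer_after_sync) {
  const int layer =
      frame < frame_to_sync ? layer_before_sync : layer_after_sync;
  if (layer >= 0)
    vpx_codec_control(decoder, VP9_DECODE_SVC_SPATIAL_LAYER, layer);
}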
@@ -307,8 +307,8 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { if (current_video_frame_ >= frame_to_sync_) ++mismatch_nframes_; } @@ -331,7 +331,7 @@ class SyncFrameOnePassCbrSvc : public OnePassCbrSvc, vpx_svc_ref_frame_config_t ref_frame_config_; private: - virtual void SetConfig(const int num_temporal_layer) { + void SetConfig(const int num_temporal_layer) override { cfg_.rc_buf_initial_sz = 500; cfg_.rc_buf_optimal_sz = 500; cfg_.rc_buf_sz = 1000; @@ -657,15 +657,15 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } protected: - virtual ~LoopfilterOnePassCbrSvc() {} + ~LoopfilterOnePassCbrSvc() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); speed_setting_ = 7; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { PreEncodeFrameHookSetup(video, encoder); if (number_temporal_layers_ > 1 || number_spatial_layers_ > 1) { // Consider 3 cases: @@ -694,7 +694,7 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Keep track of number of non-reference frames, needed for mismatch check. // Non-reference frames are top spatial and temporal layer frames, // for TL > 0. @@ -704,12 +704,12 @@ class LoopfilterOnePassCbrSvc : public OnePassCbrSvc, num_nonref_frames_++; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { ++mismatch_nframes_; } - virtual void SetConfig(const int /*num_temporal_layer*/) {} + void SetConfig(const int /*num_temporal_layer*/) override {} int GetMismatchFrames() const { return mismatch_nframes_; } int GetNonRefFrames() const { return num_nonref_frames_; } diff --git a/test/svc_test.h b/test/svc_test.h index f1d727fd9..0026372de 100644 --- a/test/svc_test.h +++ b/test/svc_test.h @@ -36,7 +36,7 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest { } protected: - virtual ~OnePassCbrSvc() {} + ~OnePassCbrSvc() override {} virtual void SetConfig(const int num_temporal_layer) = 0; @@ -46,11 +46,11 @@ class OnePassCbrSvc : public ::libvpx_test::EncoderTest { virtual void PreEncodeFrameHookSetup(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder); - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder); + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override; virtual void AssignLayerBitrates(); - virtual void MismatchHook(const vpx_image_t *, const vpx_image_t *) {} + void MismatchHook(const vpx_image_t *, const vpx_image_t *) override {} vpx_svc_extra_cfg_t svc_params_; int64_t bits_in_buffer_model_[VPX_MAX_LAYERS]; diff --git a/test/test-data.mk b/test/test-data.mk index 62a9d6ef1..9eabffae3 100644 --- a/test/test-data.mk +++ b/test/test-data.mk @@ -29,6 +29,7 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += niklas_640_480_30.yuv LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += bus_352x288_420_f20_b8.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += 
crowd_run_360p_10_150f.y4m # Test vectors LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 55f92a25d..a9decc6b6 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -870,3 +870,4 @@ bac455906360b45338a16dd626ac5f19bc36a307 *desktop_office1.1280_720-020.yuv d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv 8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv +ad9942a073e245585c93f764ea299382a65939a7 *crowd_run_360p_10_150f.y4m diff --git a/test/test.mk b/test/test.mk index f60d8f823..d4521f08b 100644 --- a/test/test.mk +++ b/test/test.mk @@ -7,6 +7,8 @@ LIBVPX_TEST_SRCS-yes += codec_factory.h LIBVPX_TEST_SRCS-yes += md5_helper.h LIBVPX_TEST_SRCS-yes += register_state_check.h LIBVPX_TEST_SRCS-yes += test.mk +LIBVPX_TEST_SRCS-yes += init_vpx_test.cc +LIBVPX_TEST_SRCS-yes += init_vpx_test.h LIBVPX_TEST_SRCS-yes += test_libvpx.cc LIBVPX_TEST_SRCS-yes += test_vectors.cc LIBVPX_TEST_SRCS-yes += test_vectors.h @@ -22,10 +24,6 @@ LIBVPX_TEST_SRCS-yes += ../md5_utils.h ../md5_utils.c LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ivf_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += altref_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += alt_ref_aq_segment_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp8_datarate_test.cc -LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += encode_api_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h @@ -37,6 +35,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += yuv_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += config_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += cq_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += byte_alignment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += decode_svc_test.cc @@ -44,6 +43,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += external_frame_buffer_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += alt_ref_aq_segment_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += aq_segment_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc @@ -58,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_datarate_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h @@ -85,6 +87,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.cc LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += ../webmdec.h LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += webm_video_source.h LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_skip_loopfilter_test.cc 
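test.mk now builds init_vpx_test.cc/init_vpx_test.h into every test binary. The hunks in this change only show the files being added to the build and called from main(), so the following header shape is an assumption based on those ::libvpx_test::init_vpx_test() call sites:

#ifndef TEST_INIT_VPX_TEST_H_
#define TEST_INIT_VPX_TEST_H_

namespace libvpx_test {
// One-time process setup shared by the test binaries: append negative gtest
// filters for SIMD suites the host cannot run, and initialize the RTCD
// function tables needed by whitebox tests in static builds.
void init_vpx_test();
}  // namespace libvpx_test

#endif  // TEST_INIT_VPX_TEST_H_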
+$(BUILD_PFX)third_party/libwebm/%.cc.o: CXXFLAGS += $(LIBWEBM_CXXFLAGS) endif LIBVPX_TEST_SRCS-$(CONFIG_DECODERS) += decode_api_test.cc @@ -179,7 +182,7 @@ ifneq ($(CONFIG_REALTIME_ONLY),yes) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += yuv_temporal_filter_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc -ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2))) +ifneq (, $(filter yes, $(HAVE_SSE2) $(HAVE_AVX2) $(HAVE_NEON))) LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_block_error_test.cc endif LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc @@ -214,6 +217,8 @@ endif TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.cc +TEST_INTRA_PRED_SPEED_SRCS-yes += init_vpx_test.h RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index 28b3484a0..4c464a262 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -14,9 +14,11 @@ #include "third_party/googletest/src/include/gtest/gtest.h" +#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "test/acm_random.h" #include "test/clear_system_state.h" +#include "test/init_vpx_test.h" #include "test/md5_helper.h" #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" @@ -269,28 +271,32 @@ INTRA_PRED_TEST(NEON, TestIntraPred4, vpx_dc_predictor_4x4_neon, vpx_dc_left_predictor_4x4_neon, vpx_dc_top_predictor_4x4_neon, vpx_dc_128_predictor_4x4_neon, vpx_v_predictor_4x4_neon, vpx_h_predictor_4x4_neon, vpx_d45_predictor_4x4_neon, - vpx_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, - vpx_tm_predictor_4x4_neon) + vpx_d135_predictor_4x4_neon, vpx_d117_predictor_4x4_neon, + vpx_d153_predictor_4x4_neon, vpx_d207_predictor_4x4_neon, + vpx_d63_predictor_4x4_neon, vpx_tm_predictor_4x4_neon) INTRA_PRED_TEST(NEON, TestIntraPred8, vpx_dc_predictor_8x8_neon, vpx_dc_left_predictor_8x8_neon, vpx_dc_top_predictor_8x8_neon, vpx_dc_128_predictor_8x8_neon, vpx_v_predictor_8x8_neon, vpx_h_predictor_8x8_neon, vpx_d45_predictor_8x8_neon, - vpx_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, - vpx_tm_predictor_8x8_neon) + vpx_d135_predictor_8x8_neon, vpx_d117_predictor_8x8_neon, + vpx_d153_predictor_8x8_neon, vpx_d207_predictor_8x8_neon, + vpx_d63_predictor_8x8_neon, vpx_tm_predictor_8x8_neon) INTRA_PRED_TEST(NEON, TestIntraPred16, vpx_dc_predictor_16x16_neon, vpx_dc_left_predictor_16x16_neon, vpx_dc_top_predictor_16x16_neon, vpx_dc_128_predictor_16x16_neon, vpx_v_predictor_16x16_neon, vpx_h_predictor_16x16_neon, vpx_d45_predictor_16x16_neon, - vpx_d135_predictor_16x16_neon, nullptr, nullptr, nullptr, - nullptr, vpx_tm_predictor_16x16_neon) + vpx_d135_predictor_16x16_neon, vpx_d117_predictor_16x16_neon, + vpx_d153_predictor_16x16_neon, vpx_d207_predictor_16x16_neon, + vpx_d63_predictor_16x16_neon, vpx_tm_predictor_16x16_neon) INTRA_PRED_TEST(NEON, TestIntraPred32, vpx_dc_predictor_32x32_neon, vpx_dc_left_predictor_32x32_neon, vpx_dc_top_predictor_32x32_neon, vpx_dc_128_predictor_32x32_neon, vpx_v_predictor_32x32_neon, vpx_h_predictor_32x32_neon, vpx_d45_predictor_32x32_neon, - vpx_d135_predictor_32x32_neon, nullptr, nullptr, nullptr, - nullptr, vpx_tm_predictor_32x32_neon) + vpx_d135_predictor_32x32_neon, vpx_d117_predictor_32x32_neon, + vpx_d153_predictor_32x32_neon, vpx_d207_predictor_32x32_neon, + 
vpx_d63_predictor_32x32_neon, vpx_tm_predictor_32x32_neon) #endif // HAVE_NEON #if HAVE_MSA @@ -344,6 +350,15 @@ INTRA_PRED_TEST(VSX, TestIntraPred32, vpx_dc_predictor_32x32_vsx, vpx_tm_predictor_32x32_vsx) #endif // HAVE_VSX +#if HAVE_LSX +INTRA_PRED_TEST(LSX, TestIntraPred8, vpx_dc_predictor_8x8_lsx, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr) +INTRA_PRED_TEST(LSX, TestIntraPred16, vpx_dc_predictor_16x16_lsx, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr) +#endif // HAVE_LSX + // ----------------------------------------------------------------------------- #if CONFIG_VP9_HIGHBITDEPTH @@ -561,37 +576,41 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_dc_left_predictor_4x4_neon, vpx_highbd_dc_top_predictor_4x4_neon, vpx_highbd_dc_128_predictor_4x4_neon, vpx_highbd_v_predictor_4x4_neon, vpx_highbd_h_predictor_4x4_neon, vpx_highbd_d45_predictor_4x4_neon, - vpx_highbd_d135_predictor_4x4_neon, nullptr, nullptr, nullptr, nullptr, - vpx_highbd_tm_predictor_4x4_neon) + vpx_highbd_d135_predictor_4x4_neon, vpx_highbd_d117_predictor_4x4_neon, + vpx_highbd_d153_predictor_4x4_neon, vpx_highbd_d207_predictor_4x4_neon, + vpx_highbd_d63_predictor_4x4_neon, vpx_highbd_tm_predictor_4x4_neon) HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_neon, vpx_highbd_dc_left_predictor_8x8_neon, vpx_highbd_dc_top_predictor_8x8_neon, vpx_highbd_dc_128_predictor_8x8_neon, vpx_highbd_v_predictor_8x8_neon, vpx_highbd_h_predictor_8x8_neon, vpx_highbd_d45_predictor_8x8_neon, - vpx_highbd_d135_predictor_8x8_neon, nullptr, nullptr, nullptr, nullptr, - vpx_highbd_tm_predictor_8x8_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred16, - vpx_highbd_dc_predictor_16x16_neon, - vpx_highbd_dc_left_predictor_16x16_neon, - vpx_highbd_dc_top_predictor_16x16_neon, - vpx_highbd_dc_128_predictor_16x16_neon, - vpx_highbd_v_predictor_16x16_neon, - vpx_highbd_h_predictor_16x16_neon, - vpx_highbd_d45_predictor_16x16_neon, - vpx_highbd_d135_predictor_16x16_neon, nullptr, nullptr, - nullptr, nullptr, vpx_highbd_tm_predictor_16x16_neon) -HIGHBD_INTRA_PRED_TEST(NEON, TestHighbdIntraPred32, - vpx_highbd_dc_predictor_32x32_neon, - vpx_highbd_dc_left_predictor_32x32_neon, - vpx_highbd_dc_top_predictor_32x32_neon, - vpx_highbd_dc_128_predictor_32x32_neon, - vpx_highbd_v_predictor_32x32_neon, - vpx_highbd_h_predictor_32x32_neon, - vpx_highbd_d45_predictor_32x32_neon, - vpx_highbd_d135_predictor_32x32_neon, nullptr, nullptr, - nullptr, nullptr, vpx_highbd_tm_predictor_32x32_neon) + vpx_highbd_d135_predictor_8x8_neon, vpx_highbd_d117_predictor_8x8_neon, + vpx_highbd_d153_predictor_8x8_neon, vpx_highbd_d207_predictor_8x8_neon, + vpx_highbd_d63_predictor_8x8_neon, vpx_highbd_tm_predictor_8x8_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred16, vpx_highbd_dc_predictor_16x16_neon, + vpx_highbd_dc_left_predictor_16x16_neon, + vpx_highbd_dc_top_predictor_16x16_neon, + vpx_highbd_dc_128_predictor_16x16_neon, vpx_highbd_v_predictor_16x16_neon, + vpx_highbd_h_predictor_16x16_neon, vpx_highbd_d45_predictor_16x16_neon, + vpx_highbd_d135_predictor_16x16_neon, vpx_highbd_d117_predictor_16x16_neon, + vpx_highbd_d153_predictor_16x16_neon, vpx_highbd_d207_predictor_16x16_neon, + vpx_highbd_d63_predictor_16x16_neon, vpx_highbd_tm_predictor_16x16_neon) +HIGHBD_INTRA_PRED_TEST( + NEON, TestHighbdIntraPred32, vpx_highbd_dc_predictor_32x32_neon, + vpx_highbd_dc_left_predictor_32x32_neon, + 
vpx_highbd_dc_top_predictor_32x32_neon, + vpx_highbd_dc_128_predictor_32x32_neon, vpx_highbd_v_predictor_32x32_neon, + vpx_highbd_h_predictor_32x32_neon, vpx_highbd_d45_predictor_32x32_neon, + vpx_highbd_d135_predictor_32x32_neon, vpx_highbd_d117_predictor_32x32_neon, + vpx_highbd_d153_predictor_32x32_neon, vpx_highbd_d207_predictor_32x32_neon, + vpx_highbd_d63_predictor_32x32_neon, vpx_highbd_tm_predictor_32x32_neon) #endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH -#include "test/test_libvpx.cc" +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + ::libvpx_test::init_vpx_test(); + return RUN_ALL_TESTS(); +} diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc index 222a83f8c..c1798b8b8 100644 --- a/test/test_libvpx.cc +++ b/test/test_libvpx.cc @@ -7,69 +7,12 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ -#include <string> +#include "test/init_vpx_test.h" #include "third_party/googletest/src/include/gtest/gtest.h" -#include "./vpx_config.h" -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 -#include "vpx_ports/x86.h" -#endif -extern "C" { -#if CONFIG_VP8 -extern void vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 -extern void vp9_rtcd(); -#endif // CONFIG_VP9 -extern void vpx_dsp_rtcd(); -extern void vpx_scale_rtcd(); -} - -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 -static void append_negative_gtest_filter(const char *str) { - std::string filter = ::testing::FLAGS_gtest_filter; - // Negative patterns begin with one '-' followed by a ':' separated list. - if (filter.find('-') == std::string::npos) filter += '-'; - filter += str; - ::testing::FLAGS_gtest_filter = filter; -} -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 - int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - -#if VPX_ARCH_X86 || VPX_ARCH_X86_64 - const int simd_caps = x86_simd_caps(); - if (!(simd_caps & HAS_MMX)) append_negative_gtest_filter(":MMX.*:MMX/*"); - if (!(simd_caps & HAS_SSE)) append_negative_gtest_filter(":SSE.*:SSE/*"); - if (!(simd_caps & HAS_SSE2)) append_negative_gtest_filter(":SSE2.*:SSE2/*"); - if (!(simd_caps & HAS_SSE3)) append_negative_gtest_filter(":SSE3.*:SSE3/*"); - if (!(simd_caps & HAS_SSSE3)) { - append_negative_gtest_filter(":SSSE3.*:SSSE3/*"); - } - if (!(simd_caps & HAS_SSE4_1)) { - append_negative_gtest_filter(":SSE4_1.*:SSE4_1/*"); - } - if (!(simd_caps & HAS_AVX)) append_negative_gtest_filter(":AVX.*:AVX/*"); - if (!(simd_caps & HAS_AVX2)) append_negative_gtest_filter(":AVX2.*:AVX2/*"); - if (!(simd_caps & HAS_AVX512)) { - append_negative_gtest_filter(":AVX512.*:AVX512/*"); - } -#endif // VPX_ARCH_X86 || VPX_ARCH_X86_64 - -#if !CONFIG_SHARED -// Shared library builds don't support whitebox tests -// that exercise internal symbols. 
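The SIMD-capability filtering and RTCD setup being deleted from test_libvpx.cc here presumably moved into the new test/init_vpx_test.cc largely verbatim, so that test_intra_pred_speed gets the same setup without its old trick of including test_libvpx.cc directly. A condensed reconstruction, abbreviated to a single ISA check (the exact file layout is an assumption):

#include <string>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "./vpx_config.h"
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
#include "vpx_ports/x86.h"
#endif

extern "C" {
void vpx_dsp_rtcd();
void vpx_scale_rtcd();
}

namespace libvpx_test {
void init_vpx_test() {
#if VPX_ARCH_X86 || VPX_ARCH_X86_64
  // Same negative-filter trick as the deleted block: hide, e.g., the AVX2
  // suites on machines without AVX2 support.
  std::string filter = ::testing::FLAGS_gtest_filter;
  if (filter.find('-') == std::string::npos) filter += '-';
  if (!(x86_simd_caps() & HAS_AVX2)) filter += ":AVX2.*:AVX2/*";
  // ... MMX/SSE/SSE2/SSE3/SSSE3/SSE4_1/AVX/AVX512 handled the same way ...
  ::testing::FLAGS_gtest_filter = filter;
#endif
#if !CONFIG_SHARED
  // Whitebox tests call internal symbols directly, so prime the RTCD tables
  // (vp8_rtcd()/vp9_rtcd() are likewise guarded by CONFIG_VP8/CONFIG_VP9).
  vpx_dsp_rtcd();
  vpx_scale_rtcd();
#endif
}
}  // namespace libvpx_test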
-#if CONFIG_VP8 - vp8_rtcd(); -#endif // CONFIG_VP8 -#if CONFIG_VP9 - vp9_rtcd(); -#endif // CONFIG_VP9 - vpx_dsp_rtcd(); - vpx_scale_rtcd(); -#endif // !CONFIG_SHARED - + ::libvpx_test::init_vpx_test(); return RUN_ALL_TESTS(); } diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index ca990f4dd..ee552113c 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -48,7 +48,7 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, #endif } - virtual ~TestVectorTest() { + ~TestVectorTest() override { if (md5_file_) fclose(md5_file_); } @@ -59,9 +59,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, } #if CONFIG_VP9_DECODER - virtual void PreDecodeFrameHook( - const libvpx_test::CompressedVideoSource &video, - libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(const libvpx_test::CompressedVideoSource &video, + libvpx_test::Decoder *decoder) override { if (video.frame_number() == 0 && mt_mode_ >= 0) { if (mt_mode_ == 1) { decoder->Control(VP9D_SET_LOOP_FILTER_OPT, 1); @@ -77,8 +76,8 @@ class TestVectorTest : public ::libvpx_test::DecoderTest, } #endif - virtual void DecompressedFrameHook(const vpx_image_t &img, - const unsigned int frame_number) { + void DecompressedFrameHook(const vpx_image_t &img, + const unsigned int frame_number) override { ASSERT_NE(md5_file_, nullptr); char expected_md5[33]; char junk[128]; diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index d92c13f88..dab6e531b 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -36,18 +36,18 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1); } - virtual ~TileIndependenceTest() { + ~TileIndependenceTest() override { delete fw_dec_; delete inv_dec_; } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(libvpx_test::kTwoPassGood); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_); } @@ -65,7 +65,7 @@ class TileIndependenceTest : public ::libvpx_test::EncoderTest, md5->Add(img); } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { UpdateMD5(fw_dec_, pkt, &md5_fw_order_); UpdateMD5(inv_dec_, pkt, &md5_inv_order_); } diff --git a/test/timestamp_test.cc b/test/timestamp_test.cc index 645a9f2ff..00abf8f31 100644 --- a/test/timestamp_test.cc +++ b/test/timestamp_test.cc @@ -42,16 +42,16 @@ class DummyTimebaseVideoSource : public ::libvpx_test::DummyVideoSource { (static_cast<double>(framerate_numerator_) / framerate_denominator_); } - virtual vpx_codec_pts_t pts() const { + vpx_codec_pts_t pts() const override { return static_cast<vpx_codec_pts_t>(frame_ * FrameDuration() + starting_pts_ + 0.5); } - virtual unsigned long duration() const { + unsigned long duration() const override { return static_cast<unsigned long>(FrameDuration() + 0.5); } - virtual vpx_rational_t timebase() const { return timebase_; } + vpx_rational_t timebase() const override { return timebase_; } void set_starting_pts(int64_t starting_pts) { starting_pts_ = starting_pts; } @@ -67,9 +67,9 @@ class TimestampTest public ::libvpx_test::CodecTestWithParam<libvpx_test::TestMode> { protected: TimestampTest() : EncoderTest(GET_PARAM(0)) {} - virtual ~TimestampTest() {} + 
~TimestampTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); } diff --git a/test/tools_common.sh b/test/tools_common.sh index 0e4a0a5c0..d0dd24df3 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -280,7 +280,12 @@ run_tests() { test_end "${test}" done - local tested_config="$(test_configuration_target) @ $(current_hash)" + # C vs SIMD tests are run for x86 32-bit, 64-bit and ARM platform + if [ "${test_name}" = "vp9_c_vs_simd_encode" ]; then + local tested_config="$(current_hash)" + else + local tested_config="$(test_configuration_target) @ $(current_hash)" + fi echo "${test_name}: Done, all tests pass for ${tested_config}." } diff --git a/test/variance_test.cc b/test/variance_test.cc index a6c8ef048..b8320e9ce 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -210,7 +210,7 @@ class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> { public: SumOfSquaresTest() : func_(GetParam()) {} - virtual ~SumOfSquaresTest() { libvpx_test::ClearSystemState(); } + ~SumOfSquaresTest() override { libvpx_test::ClearSystemState(); } protected: void ConstTest(); @@ -289,7 +289,7 @@ template <typename FunctionType> class MainTestClass : public ::testing::TestWithParam<TestParams<FunctionType> > { public: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -308,7 +308,7 @@ class MainTestClass #endif } - virtual void TearDown() { + void TearDown() override { #if CONFIG_VP9_HIGHBITDEPTH if (use_high_bit_depth()) { // TODO(skal): remove! @@ -568,7 +568,7 @@ template <typename FunctionType> class SubpelVarianceTest : public ::testing::TestWithParam<TestParams<FunctionType> > { public: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); @@ -592,7 +592,7 @@ class SubpelVarianceTest ASSERT_NE(ref_, nullptr); } - virtual void TearDown() { + void TearDown() override { if (!use_high_bit_depth()) { vpx_free(src_); vpx_free(sec_); @@ -773,6 +773,7 @@ TEST_P(VpxSseTest, RefSse) { RefTestSse(); } TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } TEST_P(VpxMseTest, RefMse) { RefTestMse(); } TEST_P(VpxMseTest, MaxMse) { MaxTestMse(); } +TEST_P(VpxMseTest, DISABLED_Speed) { SpeedTest(); } TEST_P(VpxVarianceTest, Zero) { ZeroTest(); } TEST_P(VpxVarianceTest, Ref) { RefTest(); } TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); } @@ -1428,7 +1429,10 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(5, 4, &vpx_variance32x16_avx2), VarianceParams(4, 5, &vpx_variance16x32_avx2), VarianceParams(4, 4, &vpx_variance16x16_avx2), - VarianceParams(4, 3, &vpx_variance16x8_avx2))); + VarianceParams(4, 3, &vpx_variance16x8_avx2), + VarianceParams(3, 4, &vpx_variance8x16_avx2), + VarianceParams(3, 3, &vpx_variance8x8_avx2), + VarianceParams(3, 2, &vpx_variance8x4_avx2))); INSTANTIATE_TEST_SUITE_P( AVX2, VpxSubpelVarianceTest, @@ -1450,8 +1454,10 @@ INSTANTIATE_TEST_SUITE_P(NEON, VpxSseTest, &vpx_get4x4sse_cs_neon))); INSTANTIATE_TEST_SUITE_P(NEON, VpxMseTest, - ::testing::Values(MseParams(4, 4, - &vpx_mse16x16_neon))); + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon), + MseParams(4, 3, &vpx_mse16x8_neon), + MseParams(3, 4, &vpx_mse8x16_neon), + MseParams(3, 3, &vpx_mse8x8_neon))); INSTANTIATE_TEST_SUITE_P( NEON, VpxVarianceTest, @@ -1469,6 +1475,35 @@ INSTANTIATE_TEST_SUITE_P( VarianceParams(2, 3, &vpx_variance4x8_neon), VarianceParams(2, 2, &vpx_variance4x4_neon))); +#if 
HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxSseTest, + ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxMseTest, + ::testing::Values(MseParams(4, 4, &vpx_mse16x16_neon_dotprod), + MseParams(4, 3, &vpx_mse16x8_neon_dotprod), + MseParams(3, 4, &vpx_mse8x16_neon_dotprod), + MseParams(3, 3, &vpx_mse8x8_neon_dotprod))); + +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxVarianceTest, + ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_neon_dotprod), + VarianceParams(6, 5, &vpx_variance64x32_neon_dotprod), + VarianceParams(5, 6, &vpx_variance32x64_neon_dotprod), + VarianceParams(5, 5, &vpx_variance32x32_neon_dotprod), + VarianceParams(5, 4, &vpx_variance32x16_neon_dotprod), + VarianceParams(4, 5, &vpx_variance16x32_neon_dotprod), + VarianceParams(4, 4, &vpx_variance16x16_neon_dotprod), + VarianceParams(4, 3, &vpx_variance16x8_neon_dotprod), + VarianceParams(3, 4, &vpx_variance8x16_neon_dotprod), + VarianceParams(3, 3, &vpx_variance8x8_neon_dotprod), + VarianceParams(3, 2, &vpx_variance8x4_neon_dotprod), + VarianceParams(2, 3, &vpx_variance4x8_neon_dotprod), + VarianceParams(2, 2, &vpx_variance4x4_neon_dotprod))); +#endif // HAVE_NEON_DOTPROD + INSTANTIATE_TEST_SUITE_P( NEON, VpxSubpelVarianceTest, ::testing::Values( @@ -1505,6 +1540,36 @@ INSTANTIATE_TEST_SUITE_P( #if CONFIG_VP9_HIGHBITDEPTH INSTANTIATE_TEST_SUITE_P( + NEON, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_12_mse16x16_neon, VPX_BITS_12), + MseParams(4, 3, &vpx_highbd_12_mse16x8_neon, VPX_BITS_12), + MseParams(3, 4, &vpx_highbd_12_mse8x16_neon, VPX_BITS_12), + MseParams(3, 3, &vpx_highbd_12_mse8x8_neon, VPX_BITS_12), + MseParams(4, 4, &vpx_highbd_10_mse16x16_neon, VPX_BITS_10), + MseParams(4, 3, &vpx_highbd_10_mse16x8_neon, VPX_BITS_10), + MseParams(3, 4, &vpx_highbd_10_mse8x16_neon, VPX_BITS_10), + MseParams(3, 3, &vpx_highbd_10_mse8x8_neon, VPX_BITS_10), + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon, VPX_BITS_8))); + +// TODO(webm:1819): Re-enable when vpx_highbd_8_mse16x16_neon_dotprod, etc. can +// be used again. 
+#if 0 +#if HAVE_NEON_DOTPROD +INSTANTIATE_TEST_SUITE_P( + NEON_DOTPROD, VpxHBDMseTest, + ::testing::Values( + MseParams(4, 4, &vpx_highbd_8_mse16x16_neon_dotprod, VPX_BITS_8), + MseParams(4, 3, &vpx_highbd_8_mse16x8_neon_dotprod, VPX_BITS_8), + MseParams(3, 4, &vpx_highbd_8_mse8x16_neon_dotprod, VPX_BITS_8), + MseParams(3, 3, &vpx_highbd_8_mse8x8_neon_dotprod, VPX_BITS_8))); +#endif // HAVE_NEON_DOTPROD +#endif // 0 + +INSTANTIATE_TEST_SUITE_P( NEON, VpxHBDVarianceTest, ::testing::Values( VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12), @@ -1572,6 +1637,10 @@ INSTANTIATE_TEST_SUITE_P( 12), SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon, 12), + SubpelVarianceParams(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_neon, + 12), + SubpelVarianceParams(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_neon, + 12), SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon, 10), SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon, @@ -1594,6 +1663,10 @@ INSTANTIATE_TEST_SUITE_P( 10), SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon, 10), + SubpelVarianceParams(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_neon, + 10), + SubpelVarianceParams(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_neon, + 10), SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon, 8), SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon, @@ -1613,7 +1686,9 @@ INSTANTIATE_TEST_SUITE_P( SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon, 8), SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8), - SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, + SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon, 8), + SubpelVarianceParams(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_neon, 8), + SubpelVarianceParams(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_neon, 8))); INSTANTIATE_TEST_SUITE_P( @@ -1652,6 +1727,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_neon, 12), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_12_sub_pixel_avg_variance4x8_neon, + 12), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_12_sub_pixel_avg_variance4x4_neon, + 12), SubpelAvgVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_neon, 10), @@ -1685,6 +1766,12 @@ INSTANTIATE_TEST_SUITE_P( SubpelAvgVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_neon, 10), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_10_sub_pixel_avg_variance4x8_neon, + 10), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_10_sub_pixel_avg_variance4x4_neon, + 10), SubpelAvgVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_neon, 8), @@ -1717,6 +1804,12 @@ INSTANTIATE_TEST_SUITE_P( 8), SubpelAvgVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_neon, + 8), + SubpelAvgVarianceParams(2, 3, + &vpx_highbd_8_sub_pixel_avg_variance4x8_neon, + 8), + SubpelAvgVarianceParams(2, 2, + &vpx_highbd_8_sub_pixel_avg_variance4x4_neon, 8))); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/test/video_source.h b/test/video_source.h index a10ff6fb0..2194126f1 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -64,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) { return fopen(path_to_source.c_str(), "rb"); } -static FILE *GetTempOutFile(std::string *file_name) { +static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) { file_name->clear(); #if defined(_WIN32) char fname[MAX_PATH]; @@ -73,7 +73,7 @@ static FILE 
*GetTempOutFile(std::string *file_name) { // Assume for now that the filename generated is unique per process if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { file_name->assign(fname); - return fopen(fname, "wb+"); + return fopen(fname, io_mode); } } return nullptr; @@ -94,13 +94,16 @@ static FILE *GetTempOutFile(std::string *file_name) { const int fd = mkstemp(temp_file_name.get()); if (fd == -1) return nullptr; *file_name = temp_file_name.get(); - return fdopen(fd, "wb+"); + return fdopen(fd, io_mode); #endif } class TempOutFile { public: - TempOutFile() { file_ = GetTempOutFile(&file_name_); } + TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); } + TempOutFile(const char *io_mode) { + file_ = GetTempOutFile(&file_name_, io_mode); + } ~TempOutFile() { CloseFile(); if (!file_name_.empty()) { @@ -160,35 +163,35 @@ class DummyVideoSource : public VideoSource { ReallocImage(); } - virtual ~DummyVideoSource() { vpx_img_free(img_); } + ~DummyVideoSource() override { vpx_img_free(img_); } - virtual void Begin() { + void Begin() override { frame_ = 0; FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_ : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. - virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { 1, 30 }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } void set_limit(unsigned int limit) { limit_ = limit; } @@ -235,7 +238,7 @@ class RandomVideoSource : public DummyVideoSource { protected: // Reset the RNG to get a matching stream for the second pass - virtual void Begin() { + void Begin() override { frame_ = 0; rnd_.Reset(seed_); FillFrame(); @@ -243,7 +246,7 @@ class RandomVideoSource : public DummyVideoSource { // 15 frames of noise, followed by 15 static frames. Reset to 0 rather // than holding previous frames to encourage keyframes to be thrown. 
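The GetTempOutFile()/TempOutFile changes above let a caller pick the stdio mode instead of the hard-coded "wb+", on both the _WIN32 (GetTempFileNameA) and POSIX (mkstemp/fdopen) paths. A usage sketch, assuming TempOutFile's usual file() accessor; the "payload" write is purely illustrative:

#include <cstdio>
#include "test/video_source.h"

void TempFileModesExample() {
  libvpx_test::TempOutFile write_only("wb");  // new: caller-selected io_mode
  libvpx_test::TempOutFile read_write;        // unchanged default: "wb+"
  if (write_only.file() != nullptr) {
    fputs("payload", write_only.file());      // cleaned up by the destructor
  }
}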
- virtual void FillFrame() { + void FillFrame() override { if (img_) { if (frame_ % 30 < 15) { for (size_t i = 0; i < raw_sz_; ++i) img_->img_data[i] = rnd_.Rand8(); diff --git a/test/vp8_datarate_test.cc b/test/vp8_datarate_test.cc index 64a861d15..aee27af66 100644 --- a/test/vp8_datarate_test.cc +++ b/test/vp8_datarate_test.cc @@ -24,10 +24,10 @@ class DatarateTestLarge public: DatarateTestLarge() : EncoderTest(GET_PARAM(0)) {} - virtual ~DatarateTestLarge() {} + ~DatarateTestLarge() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(GET_PARAM(1)); set_cpu_used_ = GET_PARAM(2); @@ -47,8 +47,8 @@ class DatarateTestLarge use_roi_ = false; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_NOISE_SENSITIVITY, denoiser_on_); encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); @@ -74,7 +74,7 @@ class DatarateTestLarge duration_ = 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; @@ -121,7 +121,7 @@ class DatarateTestLarge ++frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { if (bits_total_) { const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit @@ -301,7 +301,7 @@ TEST_P(DatarateTestLarge, DropFramesMultiThreads) { class DatarateTestRealTime : public DatarateTestLarge { public: - virtual ~DatarateTestRealTime() {} + ~DatarateTestRealTime() override = default; }; #if CONFIG_TEMPORAL_DENOISING diff --git a/test/vp8_denoiser_sse2_test.cc b/test/vp8_denoiser_sse2_test.cc index 8cb84ddd8..7fa867d8b 100644 --- a/test/vp8_denoiser_sse2_test.cc +++ b/test/vp8_denoiser_sse2_test.cc @@ -30,11 +30,11 @@ namespace { const int kNumPixels = 16 * 16; class VP8DenoiserTest : public ::testing::TestWithParam<int> { public: - virtual ~VP8DenoiserTest() {} + ~VP8DenoiserTest() override = default; - virtual void SetUp() { increase_denoising_ = GetParam(); } + void SetUp() override { increase_denoising_ = GetParam(); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: int increase_denoising_; diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index 1b73a72a0..66d5c151c 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -74,7 +74,7 @@ using libvpx_test::ACMRandom; class FdctTest : public ::testing::TestWithParam<FdctFunc> { public: - virtual void SetUp() { + void SetUp() override { fdct_func_ = GetParam(); rnd_.Reset(ACMRandom::DeterministicSeed()); } diff --git a/test/vp8_fragments_test.cc b/test/vp8_fragments_test.cc index 6e5baf229..01b4c2120 100644 --- a/test/vp8_fragments_test.cc +++ b/test/vp8_fragments_test.cc @@ -17,9 +17,9 @@ class VP8FragmentsTest : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: VP8FragmentsTest() : EncoderTest(&::libvpx_test::kVP8) {} - virtual ~VP8FragmentsTest() {} + ~VP8FragmentsTest() override = default; - virtual void SetUp() { + void SetUp() override { const unsigned long init_flags = // NOLINT(runtime/int) VPX_CODEC_USE_OUTPUT_PARTITION; InitializeConfig(); diff --git a/test/vp8_ratectrl_rtc_test.cc b/test/vp8_ratectrl_rtc_test.cc index 
7410f3c01..9fbc1d4d9 100644 --- a/test/vp8_ratectrl_rtc_test.cc +++ b/test/vp8_ratectrl_rtc_test.cc @@ -25,7 +25,7 @@ namespace { struct Vp8RCTestVideo { - Vp8RCTestVideo() {} + Vp8RCTestVideo() = default; Vp8RCTestVideo(const char *name_, int width_, int height_, unsigned int frames_) : name(name_), width(width_), height(height_), frames(frames_) {} @@ -52,11 +52,12 @@ class Vp8RcInterfaceTest public ::libvpx_test::CodecTestWith2Params<int, Vp8RCTestVideo> { public: Vp8RcInterfaceTest() - : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {} - virtual ~Vp8RcInterfaceTest() {} + : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false), + frame_drop_thresh_(0) {} + ~Vp8RcInterfaceTest() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } @@ -111,8 +112,8 @@ class Vp8RcInterfaceTest return layer_id; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (rc_cfg_.ts_number_layers > 1) { const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers); const int frame_flags = @@ -127,56 +128,79 @@ class Vp8RcInterfaceTest encoder->Control(VP8E_SET_CPUUSED, -6); encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); - } else if (frame_params_.frame_type == INTER_FRAME) { + } else if (frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; } } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == test_video_.frames; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { if (encoder_exit_) { return; } int qp; encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - rc_api_->ComputeQP(frame_params_); - ASSERT_EQ(rc_api_->GetQP(), qp); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + } else { + num_drops_++; + } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { rc_api_->PostEncodeUpdate(pkt->data.frame.sz); } void RunOneLayer() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. 
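These rate-control hooks now consume ComputeQP()'s FrameDropDecision instead of assuming every frame gets encoded, and the new *DropFrames runs (whose bitrate-lowering setup continues just below) deliberately provoke drops to exercise that path. The per-frame contract, condensed into one hypothetical helper; the enum and method names are as they appear in the hunks, the wrapper itself is illustrative:

#include <stdint.h>
#include "vp8/vp8_ratectrl_rtc.h"

// Returns the QP to compare against VP8E_GET_LAST_QUANTIZER, or -1 when the
// rate controller decides to drop the frame.
int RcFrameExample(libvpx::VP8RateControlRTC *rc_api, unsigned int frame,
                   unsigned int key_interval, uint64_t encoded_size,
                   int *num_drops) {
  libvpx::VP8FrameParamsQpRTC params;
  params.frame_type = (frame % key_interval == 0)
                          ? libvpx::RcFrameType::kKeyFrame
                          : libvpx::RcFrameType::kInterFrame;
  params.temporal_layer_id = 0;  // single-layer case
  if (rc_api->ComputeQP(params) == libvpx::FrameDropDecision::kDrop) {
    ++*num_drops;  // the encoder is expected to drop the same frame
    return -1;
  }
  const int qp = rc_api->GetQP();
  rc_api->PostEncodeUpdate(encoded_size);  // feed back the produced bytes
  return qp;
}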
+ target_bitrate_ = target_bitrate_ >> 2; + SetConfig(); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, test_video_.frames); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); } void RunPeriodicKey() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; key_interval_ = 100; + frame_drop_thresh_ = 30; SetConfig(); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -188,11 +212,9 @@ class Vp8RcInterfaceTest void RunTemporalLayers2TL() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(2); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -204,11 +226,9 @@ class Vp8RcInterfaceTest void RunTemporalLayers3TL() { test_video_ = GET_PARAM(2); target_bitrate_ = GET_PARAM(1); - if (test_video_.width == 1280 && target_bitrate_ == 200) return; - if (test_video_.width == 640 && target_bitrate_ == 1000) return; SetConfigTemporalLayers(3); rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, test_video_.height, 30, 1, 0, @@ -217,6 +237,28 @@ class Vp8RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunTemporalLayers3TLDropFrames() { + test_video_ = GET_PARAM(2); + target_bitrate_ = GET_PARAM(1); + frame_drop_thresh_ = 30; + num_drops_ = 0; + // Use lower target_bitrate and max_quantizer to trigger drops. + target_bitrate_ = target_bitrate_ >> 2; + SetConfigTemporalLayers(3); + rc_cfg_.max_quantizer = 56; + cfg_.rc_max_quantizer = 56; + rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + + ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width, + test_video_.height, 30, 1, 0, + test_video_.frames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + private: void SetConfig() { rc_cfg_.width = test_video_.width; @@ -232,6 +274,7 @@ class Vp8RcInterfaceTest rc_cfg_.max_intra_bitrate_pct = 1000; rc_cfg_.framerate = 30.0; rc_cfg_.layer_target_bitrate[0] = target_bitrate_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; // Encoder settings for ground truth. 
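Note how SetConfig() mirrors each new field into both configurations, so the real encoder (whose ground-truth settings continue below) stays constrained exactly like the external rate controller: frame_drop_thresh pairs with rc_dropframe_thresh, and max_quantizer with rc_max_quantizer. A sketch of that pairing; the VP8RateControlRtcConfig type name is assumed from the test's rc_cfg_ member:

#include "vp8/vp8_ratectrl_rtc.h"
#include "vpx/vpx_encoder.h"

// Keep the two configs in lockstep so the encoder provides a valid ground
// truth for the external rate controller.
void MirrorDropSettings(libvpx::VP8RateControlRtcConfig *rc_cfg,
                        vpx_codec_enc_cfg_t *cfg, int drop_thresh, int max_q) {
  rc_cfg->frame_drop_thresh = drop_thresh;
  cfg->rc_dropframe_thresh = drop_thresh;  // encoder-side equivalent
  rc_cfg->max_quantizer = max_q;
  cfg->rc_max_quantizer = max_q;
}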
cfg_.g_w = test_video_.width; @@ -250,6 +293,7 @@ class Vp8RcInterfaceTest cfg_.rc_target_bitrate = target_bitrate_; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; } void SetConfigTemporalLayers(int temporal_layers) { @@ -265,6 +309,7 @@ class Vp8RcInterfaceTest rc_cfg_.overshoot_pct = 50; rc_cfg_.max_intra_bitrate_pct = 1000; rc_cfg_.framerate = 30.0; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; if (temporal_layers == 2) { rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100; rc_cfg_.layer_target_bitrate[1] = target_bitrate_; @@ -298,6 +343,7 @@ class Vp8RcInterfaceTest cfg_.rc_target_bitrate = target_bitrate_; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; // 2 Temporal layers, no spatial layers, CBR mode. cfg_.ss_number_layers = 1; cfg_.ts_number_layers = temporal_layers; @@ -325,16 +371,24 @@ class Vp8RcInterfaceTest Vp8RCTestVideo test_video_; libvpx::VP8FrameParamsQpRTC frame_params_; bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; }; TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(Vp8RcInterfaceTest, OneLayerDropFrames) { RunOneLayerDropFrames(); } + TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); } TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); } TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); } +TEST_P(Vp8RcInterfaceTest, TemporalLayers3TLDropFrames) { + RunTemporalLayers3TLDropFrames(); +} + VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest, ::testing::Values(200, 400, 1000), ::testing::ValuesIn(kVp8RCTestVectors)); diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc index c7e6f1af0..3882326d2 100644 --- a/test/vp9_arf_freq_test.cc +++ b/test/vp9_arf_freq_test.cc @@ -86,9 +86,9 @@ class ArfFreqTest : EncoderTest(GET_PARAM(0)), test_video_param_(GET_PARAM(1)), test_encode_param_(GET_PARAM(2)), min_arf_requested_(GET_PARAM(3)) {} - virtual ~ArfFreqTest() {} + ~ArfFreqTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(test_encode_param_.mode); if (test_encode_param_.mode != ::libvpx_test::kRealTime) { @@ -104,7 +104,7 @@ class ArfFreqTest dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { min_run_ = ARF_NOT_SEEN; run_of_visible_frames_ = 0; } @@ -126,7 +126,7 @@ class ArfFreqTest return frames; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->kind != VPX_CODEC_CX_FRAME_PKT) return; const int frames = GetNumFramesInPkt(pkt); if (frames == 1) { @@ -145,8 +145,8 @@ class ArfFreqTest } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); diff --git a/test/vp9_block_error_test.cc b/test/vp9_block_error_test.cc index b93b014e6..0645341ac 100644 --- a/test/vp9_block_error_test.cc +++ b/test/vp9_block_error_test.cc @@ -53,14 +53,14 @@ int64_t BlockError8BitWrapper(const tran_low_t *coeff, class BlockErrorTest : public ::testing::TestWithParam<BlockErrorParam> { public: - virtual ~BlockErrorTest() {} - virtual void SetUp() { + ~BlockErrorTest() 
override = default; + void SetUp() override { error_block_op_ = GET_PARAM(0); ref_error_block_op_ = GET_PARAM(1); bit_depth_ = GET_PARAM(2); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: vpx_bit_depth_t bit_depth_; @@ -197,4 +197,22 @@ INSTANTIATE_TEST_SUITE_P( &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8))); #endif // HAVE_AVX2 + +#if HAVE_NEON +const BlockErrorParam neon_block_error_tests[] = { +#if CONFIG_VP9_HIGHBITDEPTH + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_10), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_12), + make_tuple(&vp9_highbd_block_error_neon, &vp9_highbd_block_error_c, + VPX_BITS_8), +#endif // CONFIG_VP9_HIGHBITDEPTH + make_tuple(&BlockError8BitWrapper<vp9_block_error_neon>, + &BlockError8BitWrapper<vp9_block_error_c>, VPX_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(NEON, BlockErrorTest, + ::testing::ValuesIn(neon_block_error_tests)); +#endif // HAVE_NEON } // namespace diff --git a/test/vp9_c_vs_simd_encode.sh b/test/vp9_c_vs_simd_encode.sh new file mode 100755 index 000000000..03843610d --- /dev/null +++ b/test/vp9_c_vs_simd_encode.sh @@ -0,0 +1,420 @@ +#!/bin/sh +## +## Copyright (c) 2023 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## +## This script checks the bit exactness between C and SIMD +## implementations of VP9 encoder. +## +. $(dirname $0)/tools_common.sh + +TEST_BITRATES="1600 6400" +PRESETS="good rt" +TEST_CLIPS="yuv_raw_input y4m_360p_10bit_input yuv_480p_raw_input y4m_720p_input" +OUT_FILE_SUFFIX=".ivf" +SCRIPT_DIR=$(dirname "$0") +LIBVPX_SOURCE_DIR=$(cd "${SCRIPT_DIR}/.."; pwd) + +# Clips used in test. +YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" +YUV_480P_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_640_480_30.yuv" +Y4M_360P_10BIT_INPUT="${LIBVPX_TEST_DATA_PATH}/crowd_run_360p_10_150f.y4m" +Y4M_720P_INPUT="${LIBVPX_TEST_DATA_PATH}/niklas_1280_720_30.y4m" + +# Number of frames to test. +VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT=20 + +# Create a temporary directory for output files. +if [ -n "${TMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TMPDIR}" +elif [ -n "${TEMPDIR}" ]; then + VPX_TEST_TEMP_ROOT="${TEMPDIR}" +else + VPX_TEST_TEMP_ROOT=/tmp +fi + +VPX_TEST_OUTPUT_DIR="${VPX_TEST_TEMP_ROOT}/vpx_test_$$" + +if ! mkdir -p "${VPX_TEST_OUTPUT_DIR}" || \ + [ ! -d "${VPX_TEST_OUTPUT_DIR}" ]; then + echo "${0##*/}: Cannot create output directory, giving up." + echo "${0##*/}: VPX_TEST_OUTPUT_DIR=${VPX_TEST_OUTPUT_DIR}" + exit 1 +fi + +elog() { + echo "$@" 1>&2 +} + +# Echoes path to $1 when it's executable and exists in ${VPX_TEST_OUTPUT_DIR}, +# or an empty string. Caller is responsible for testing the string once the +# function returns. +vp9_enc_tool_path() { + local target="$1" + local tool_path="${VPX_TEST_OUTPUT_DIR}/build_target_${target}/vpxenc" + + if [ ! -x "${tool_path}" ]; then + tool_path="" + fi + echo "${tool_path}" +} + +# Environment check: Make sure input and source directories are available. +vp9_c_vs_simd_enc_verify_environment() { + if [ ! 
-e "${YUV_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${YUV_480P_RAW_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_720P_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -e "${Y4M_360P_10BIT_INPUT}" ]; then + elog "libvpx test data must exist in LIBVPX_TEST_DATA_PATH." + return 1 + fi + if [ ! -d "$LIBVPX_SOURCE_DIR" ]; then + elog "LIBVPX_SOURCE_DIR does not exist." + return 1 + fi +} + +# This is not needed since tools_common.sh does the same cleanup. +# Keep the code here for our reference. +# cleanup() { +# rm -rf ${VPX_TEST_OUTPUT_DIR} +# } + +# Echo VPX_SIMD_CAPS_MASK for different instruction set architecture. +avx512f() { + echo "0x1FF" +} + +avx2() { + echo "0x0FF" +} + +sse4_1() { + echo "0x03F" +} + +ssse3() { + echo "0x01F" +} + +sse2() { + echo "0x007" +} + +# Echo clip details to be used as input to vpxenc. +yuv_raw_input() { + echo ""${YUV_RAW_INPUT}" + --width=352 + --height=288 + --bit-depth=8 + --profile=0" +} + +yuv_480p_raw_input() { + echo ""${YUV_480P_RAW_INPUT}" + --width=640 + --height=480 + --bit-depth=8 + --profile=0" +} + +y4m_720p_input() { + echo ""${Y4M_720P_INPUT}" + --bit-depth=8 + --profile=0" +} + +y4m_360p_10bit_input() { + echo ""${Y4M_360P_10BIT_INPUT}" + --bit-depth=10 + --profile=2" +} + +has_x86_isa_extn() { + instruction_set=$1 + if ! grep -q "$instruction_set" /proc/cpuinfo; then + # This instruction_set is not supported. + return 1 + fi + # This instruction_set is supported. + return 0 +} + +# Echo good encode params for use with VP9 encoder. +vp9_encode_good_params() { + echo "--codec=vp9 \ + --good \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --end-usage=vbr \ + --kf-max-dist=160 \ + --kf-min-dist=0 \ + --lag-in-frames=19 \ + --max-q=63 \ + --min-q=0 \ + --passes=2 \ + --undershoot-pct=100 \ + --overshoot-pct=100 \ + --verbose \ + --auto-alt-ref=1 \ + --drop-frame=0 \ + --bias-pct=50 \ + --minsection-pct=0 \ + --maxsection-pct=2000 \ + --arnr-maxframes=7 \ + --arnr-strength=5 \ + --sharpness=0 \ + --frame-parallel=0" +} + +# Echo realtime encode params for use with VP9 encoder. +vp9_encode_rt_params() { + echo "--codec=vp9 \ + --rt \ + --test-decode=fatal \ + --ivf \ + --threads=1 \ + --static-thresh=0 \ + --tile-columns=0 \ + --tile-rows=0 \ + --end-usage=cbr \ + --kf-max-dist=90000 \ + --lag-in-frames=0 \ + --max-q=58 \ + --min-q=2 \ + --passes=1 \ + --undershoot-pct=50 \ + --overshoot-pct=50 \ + --verbose \ + --row-mt=0 \ + --buf-sz=1000 \ + --buf-initial-sz=500 \ + --buf-optimal-sz=600 \ + --max-intra-rate=300 \ + --resize-allowed=0 \ + --noise-sensitivity=0 \ + --aq-mode=3 \ + --error-resilient=0" +} + +# Configures for the given target in the +# ${VPX_TEST_OUTPUT_DIR}/build_target_${target} directory. 
+vp9_enc_build() { + local target=$1 + local configure="$2" + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + mkdir -p "$tmp_build_dir" + local save_dir="$PWD" + cd "$tmp_build_dir" + + echo "Building target: ${target}" + local config_args="--disable-install-docs \ + --enable-unit-tests \ + --enable-debug \ + --enable-postproc \ + --enable-vp9-postproc \ + --enable-vp9-temporal-denoising \ + --enable-vp9-highbitdepth" + + eval "$configure" --target="${target}" "${config_args}" ${devnull} + eval make -j$(nproc) ${devnull} + echo "Done building target: ${target}" + cd "${save_dir}" +} + +compare_enc_output() { + local target=$1 + local cpu=$2 + local clip=$3 + local bitrate=$4 + local preset=$5 + if ! diff -q ${VPX_TEST_OUTPUT_DIR}/Out-generic-gnu-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then + elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" + return 1 + fi +} + +vp9_enc_test() { + local encoder="$1" + local target=$2 + if [ -z "$(vp9_enc_tool_path "${target}")" ]; then + elog "vpxenc not found. It must exist in ${VPX_TEST_OUTPUT_DIR}/build_target_${target} path" + return 1 + fi + + local tmp_build_dir=${VPX_TEST_OUTPUT_DIR}/build_target_${target} + local save_dir="$PWD" + cd "$tmp_build_dir" + for preset in ${PRESETS}; do + if [ "${preset}" = "good" ]; then + local max_cpu_used=5 + local test_params=vp9_encode_good_params + elif [ "${preset}" = "rt" ]; then + local max_cpu_used=9 + local test_params=vp9_encode_rt_params + else + elog "Invalid preset" + cd "${save_dir}" + return 1 + fi + + # Enable armv8 test for real-time only + if [ "${preset}" = "good" ] && [ "${target}" = "armv8-linux-gcc" ]; then + continue + fi + + for cpu in $(seq 0 $max_cpu_used); do + for clip in ${TEST_CLIPS}; do + for bitrate in ${TEST_BITRATES}; do + eval "${encoder}" $($clip) $($test_params) \ + "--limit=${VP9_ENCODE_C_VS_SIMD_TEST_FRAME_LIMIT}" \ + "--cpu-used=${cpu}" "--target-bitrate=${bitrate}" "-o" \ + ${VPX_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${devnull} + + if [ "${target}" != "generic-gnu" ]; then + if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then + # Find the mismatch + cd "${save_dir}" + return 1 + fi + fi + done + done + done + done + cd "${save_dir}" +} + +vp9_test_generic() { + local configure="$LIBVPX_SOURCE_DIR/configure" + local target="generic-gnu" + + echo "Build for: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + vp9_enc_test $encoder "${target}" +} + +# This function encodes VP9 bitstream by enabling SSE2, SSSE3, SSE4_1, AVX2, AVX512f as there are +# no functions with MMX, SSE, SSE3 and AVX specialization. +# The value of environment variable 'VPX_SIMD_CAPS' controls enabling of different instruction +# set extension optimizations. 
The value of the flag 'VPX_SIMD_CAPS' and the corresponding +# instruction set extension optimizations enabled are as follows: +# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants +# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants +# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +## NOTE: On x86_64 platforms, it is not possible to enable sse/mmx/c using "VPX_SIMD_CAPS_MASK" as +# all x86_64 platforms implement sse2. +vp9_test_x86() { + local arch=$1 + + if ! uname -m | grep -q "x86"; then + elog "Machine architecture is not x86 or x86_64" + return 0 + fi + + if [ $arch = "x86" ]; then + local target="x86-linux-gcc" + elif [ $arch = "x86_64" ]; then + local target="x86_64-linux-gcc" + fi + + local x86_isa_variants="avx512f avx2 sse4_1 ssse3 sse2" + local configure="$LIBVPX_SOURCE_DIR/configure" + + echo "Build for x86: ${target}" + vp9_enc_build ${target} ${configure} + local encoder="$(vp9_enc_tool_path "${target}")" + for isa in $x86_isa_variants; do + # Note that has_x86_isa_extn returns 1 (shell false) when the extension is unsupported. + if ! has_x86_isa_extn $isa; then + echo "${isa} is not supported on this machine" + continue + fi + export VPX_SIMD_CAPS_MASK=$($isa) + if ! vp9_enc_test $encoder ${target}; then + # Find the mismatch + return 1 + fi + unset VPX_SIMD_CAPS_MASK + done +} + +vp9_test_arm() { + local target="armv8-linux-gcc" + local configure="CROSS=aarch64-linux-gnu- $LIBVPX_SOURCE_DIR/configure --extra-cflags=-march=armv8.4-a \ + --extra-cxxflags=-march=armv8.4-a" + echo "Build for arm64: ${target}" + vp9_enc_build ${target} "${configure}" + + local encoder="$(vp9_enc_tool_path "${target}")" + if ! vp9_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" ${target}; then + # Find the mismatch + return 1 + fi +} + +vp9_c_vs_simd_enc_test() { + # Test Generic + vp9_test_generic + + # Test x86 (32 bit) + echo "vp9 test for x86 (32 bit): Started." + if ! vp9_test_x86 "x86"; then + echo "vp9 test for x86 (32 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86 (32 bit): Done, all tests passed." + fi + + # Test x86_64 (64 bit) + if [ "$(eval uname -m)" = "x86_64" ]; then + echo "vp9 test for x86_64 (64 bit): Started." + if ! vp9_test_x86 "x86_64"; then + echo "vp9 test for x86_64 (64 bit): Done, test failed." + return 1 + else + echo "vp9 test for x86_64 (64 bit): Done, all tests passed." + fi + fi + + # Test ARM + echo "vp9 test for arm: Started." + if ! vp9_test_arm; then + echo "vp9 test for arm: Done, test failed." + return 1 + else + echo "vp9 test for arm: Done, all tests passed." + fi +} + +# Set up a trap function to clean up build and output files after tests complete.
+# trap cleanup EXIT + +run_tests vp9_c_vs_simd_enc_verify_environment vp9_c_vs_simd_enc_test diff --git a/test/vp9_datarate_test.cc b/test/vp9_datarate_test.cc index 7e9180749..4bc909920 100644 --- a/test/vp9_datarate_test.cc +++ b/test/vp9_datarate_test.cc @@ -28,7 +28,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { } protected: - virtual ~DatarateTestVP9() {} + ~DatarateTestVP9() override = default; virtual void ResetModel() { last_pts_ = 0; @@ -113,8 +113,8 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { return layer_id; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); @@ -164,7 +164,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { duration_ = 0; } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { // Time since last timestamp = duration. vpx_codec_pts_t duration = pkt->data.frame.pts - last_pts_; @@ -202,7 +202,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest { ++tot_frame_number_; } - virtual void EndPassHook() { + void EndPassHook() override { for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers); ++layer) { duration_ = (last_pts_ + 1) * timebase_; @@ -243,7 +243,7 @@ class DatarateTestVP9RealTimeMultiBR DatarateTestVP9RealTimeMultiBR() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -259,7 +259,7 @@ class DatarateTestVP9LargeVBR DatarateTestVP9LargeVBR() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -579,10 +579,10 @@ class DatarateTestVP9RealTime : public DatarateTestVP9, public ::libvpx_test::CodecTestWithParam<int> { public: DatarateTestVP9RealTime() : DatarateTestVP9(GET_PARAM(0)) {} - virtual ~DatarateTestVP9RealTime() {} + ~DatarateTestVP9RealTime() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -731,10 +731,10 @@ class DatarateTestVP9RealTimeDeltaQUV public ::libvpx_test::CodecTestWith2Params<int, int> { public: DatarateTestVP9RealTimeDeltaQUV() : DatarateTestVP9(GET_PARAM(0)) {} - virtual ~DatarateTestVP9RealTimeDeltaQUV() {} + ~DatarateTestVP9RealTimeDeltaQUV() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -779,7 +779,7 @@ class DatarateTestVP9PostEncodeDrop DatarateTestVP9PostEncodeDrop() : DatarateTestVP9(GET_PARAM(0)) {} protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); set_cpu_used_ = GET_PARAM(1); @@ -819,17 +819,17 @@ class DatarateTestVP9FrameQp public ::testing::TestWithParam<const libvpx_test::CodecFactory *> { public: DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {} - virtual ~DatarateTestVP9FrameQp() {} + ~DatarateTestVP9FrameQp() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); 
SetMode(::libvpx_test::kRealTime); ResetModel(); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { set_cpu_used_ = 7; DatarateTestVP9::PreEncodeFrameHook(video, encoder); frame_qp_ = static_cast<int>(rnd_.RandRange(64)); @@ -837,7 +837,7 @@ class DatarateTestVP9FrameQp frame_++; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { int qp = 0; vpx_svc_layer_id_t layer_id; if (frame_ >= total_frame_) return; @@ -847,8 +847,8 @@ class DatarateTestVP9FrameQp temporal_layer_id_ = layer_id.temporal_layer_id; } - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) { + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override { if (frame_ >= total_frame_) return; ASSERT_TRUE(cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_0212 && @@ -945,7 +945,7 @@ TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) { // Params: speed setting. class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime { public: - virtual ~DatarateTestVP9RealTimeDenoiser() {} + ~DatarateTestVP9RealTimeDenoiser() override = default; }; // Check basic datarate targeting, for a single bitrate, when denoiser is on. diff --git a/test/vp9_denoiser_test.cc b/test/vp9_denoiser_test.cc index d884b7eb9..831f83305 100644 --- a/test/vp9_denoiser_test.cc +++ b/test/vp9_denoiser_test.cc @@ -42,11 +42,11 @@ class VP9DenoiserTest : public ::testing::Test, public ::testing::WithParamInterface<VP9DenoiserTestParam> { public: - virtual ~VP9DenoiserTest() {} + ~VP9DenoiserTest() override = default; - virtual void SetUp() { bs_ = GET_PARAM(1); } + void SetUp() override { bs_ = GET_PARAM(1); } - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: BLOCK_SIZE bs_; diff --git a/test/vp9_encoder_parms_get_to_decoder.cc b/test/vp9_encoder_parms_get_to_decoder.cc index ce2198c59..0e182c76d 100644 --- a/test/vp9_encoder_parms_get_to_decoder.cc +++ b/test/vp9_encoder_parms_get_to_decoder.cc @@ -62,9 +62,9 @@ class VpxEncoderParmsGetToDecoder VpxEncoderParmsGetToDecoder() : EncoderTest(GET_PARAM(0)), encode_parms(GET_PARAM(1)) {} - virtual ~VpxEncoderParmsGetToDecoder() {} + ~VpxEncoderParmsGetToDecoder() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kTwoPassGood); cfg_.g_lag_in_frames = 25; @@ -74,8 +74,8 @@ class VpxEncoderParmsGetToDecoder cfg_.rc_target_bitrate = test_video_.bitrate; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_COLOR_SPACE, encode_parms.cs); encoder->Control(VP9E_SET_COLOR_RANGE, encode_parms.color_range); @@ -95,9 +95,9 @@ class VpxEncoderParmsGetToDecoder } } - virtual bool HandleDecodeResult(const vpx_codec_err_t res_dec, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder *decoder) { + bool HandleDecodeResult(const vpx_codec_err_t res_dec, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder *decoder) override { vpx_codec_ctx_t *const vp9_decoder = decoder->GetDecoder(); 
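// Note: the cast below reaches into the decoder's private state; this is a
// white-box check that parameters set on the encoder (color space, color
// range, etc.) actually round-tripped through the bitstream.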
vpx_codec_alg_priv_t *const priv = reinterpret_cast<vpx_codec_alg_priv_t *>(vp9_decoder->priv); diff --git a/test/vp9_end_to_end_test.cc b/test/vp9_end_to_end_test.cc index 7a85db26a..79be4ee14 100644 --- a/test/vp9_end_to_end_test.cc +++ b/test/vp9_end_to_end_test.cc @@ -89,9 +89,9 @@ class EndToEndTestAdaptiveRDThresh : EncoderTest(GET_PARAM(0)), cpu_used_start_(GET_PARAM(1)), cpu_used_end_(GET_PARAM(2)) {} - virtual ~EndToEndTestAdaptiveRDThresh() {} + ~EndToEndTestAdaptiveRDThresh() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_lag_in_frames = 0; @@ -102,8 +102,8 @@ class EndToEndTestAdaptiveRDThresh dec_cfg_.threads = 4; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_start_); encoder->Control(VP9E_SET_ROW_MT, 1); @@ -131,9 +131,9 @@ class EndToEndTestLarge denoiser_on_ = 0; } - virtual ~EndToEndTestLarge() {} + ~EndToEndTestLarge() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -149,18 +149,18 @@ class EndToEndTestLarge dec_cfg_.threads = 4; } - virtual void BeginPassHook(unsigned int) { + void BeginPassHook(unsigned int) override { psnr_ = 0.0; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1); encoder->Control(VP9E_SET_TILE_COLUMNS, 4); @@ -207,9 +207,9 @@ class EndToEndTestLoopFilterThreading EndToEndTestLoopFilterThreading() : EncoderTest(GET_PARAM(0)), use_loop_filter_opt_(GET_PARAM(1)) {} - virtual ~EndToEndTestLoopFilterThreading() {} + ~EndToEndTestLoopFilterThreading() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); cfg_.g_threads = 2; @@ -221,16 +221,16 @@ class EndToEndTestLoopFilterThreading dec_cfg_.threads = GET_PARAM(2); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 8); } encoder->Control(VP9E_SET_TILE_COLUMNS, 4 - video->frame() % 5); } - virtual void PreDecodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Decoder *decoder) { + void PreDecodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Decoder *decoder) override { if (video->frame() == 0) { decoder->Control(VP9D_SET_LOOP_FILTER_OPT, use_loop_filter_opt_ ? 
1 : 0); } diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index 238366cb6..c8d3cba7f 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -21,12 +21,12 @@ namespace { // FIRSTPASS_STATS struct: // { -// 25 double members; +// 26 double members; // 1 int64_t member; // } // Whenever FIRSTPASS_STATS struct is modified, the following constants need to // be revisited. -const int kDbl = 25; +const int kDbl = 26; const int kInt = 1; const size_t kFirstPassStatsSz = kDbl * sizeof(double) + kInt * sizeof(int64_t); @@ -44,9 +44,9 @@ class VPxFirstPassEncoderThreadTest firstpass_stats_.buf = nullptr; firstpass_stats_.sz = 0; } - virtual ~VPxFirstPassEncoderThreadTest() { free(firstpass_stats_.buf); } + ~VPxFirstPassEncoderThreadTest() override { free(firstpass_stats_.buf); } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -57,19 +57,19 @@ class VPxFirstPassEncoderThreadTest cfg_.rc_min_quantizer = 0; } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { encoder_initialized_ = false; abort_ = false; } - virtual void EndPassHook() { + void EndPassHook() override { // For first pass stats test, only run first pass encoder. if (first_pass_only_ && cfg_.g_pass == VPX_RC_FIRST_PASS) abort_ |= first_pass_only_; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { if (!encoder_initialized_) { // Encode in 2-pass mode. encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); @@ -87,7 +87,7 @@ class VPxFirstPassEncoderThreadTest } } - virtual void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) { + void StatsPktHook(const vpx_codec_cx_pkt_t *pkt) override { const uint8_t *const pkt_buf = reinterpret_cast<uint8_t *>(pkt->data.twopass_stats.buf); const size_t pkt_size = pkt->data.twopass_stats.sz; @@ -185,7 +185,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // Compare to check if using or not using row-mt generates close stats. - ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0)); + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); // Test single thread vs multiple threads row_mt_mode_ = 1; @@ -199,7 +199,7 @@ TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); // Compare to check if single-thread and multi-thread stats are close enough. - ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 1000.0)); + ASSERT_NO_FATAL_FAILURE(compare_fp_stats(&firstpass_stats_, 400.0)); // Bit exact test in row_mt mode. 
// When row_mt_mode_=1 and using >1 threads, the encoder generates bit exact @@ -233,9 +233,9 @@ class VPxEncoderThreadTest psnr_ = 0.0; nframes_ = 0; } - virtual ~VPxEncoderThreadTest() {} + ~VPxEncoderThreadTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); @@ -252,14 +252,14 @@ class VPxEncoderThreadTest cfg_.rc_min_quantizer = 0; } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { encoder_initialized_ = false; psnr_ = 0.0; nframes_ = 0; } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/, + ::libvpx_test::Encoder *encoder) override { if (!encoder_initialized_) { // Encode 4 column tiles. encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_); @@ -280,21 +280,21 @@ class VPxEncoderThreadTest } } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { psnr_ += pkt->data.psnr.psnr[0]; nframes_++; } - virtual void DecompressedFrameHook(const vpx_image_t &img, - vpx_codec_pts_t /*pts*/) { + void DecompressedFrameHook(const vpx_image_t &img, + vpx_codec_pts_t /*pts*/) override { ::libvpx_test::MD5 md5_res; md5_res.Add(&img); md5_.push_back(md5_res.Get()); } - virtual bool HandleDecodeResult(const vpx_codec_err_t res, - const libvpx_test::VideoSource & /*video*/, - libvpx_test::Decoder * /*decoder*/) { + bool HandleDecodeResult(const vpx_codec_err_t res, + const libvpx_test::VideoSource & /*video*/, + libvpx_test::Decoder * /*decoder*/) override { if (res != VPX_CODEC_OK) { EXPECT_EQ(VPX_CODEC_OK, res); return false; diff --git a/test/vp9_ext_ratectrl_test.cc b/test/vp9_ext_ratectrl_test.cc index 2bfa6281d..33fa05c65 100644 --- a/test/vp9_ext_ratectrl_test.cc +++ b/test/vp9_ext_ratectrl_test.cc @@ -18,6 +18,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" #include "vp9/simple_encode.h" #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" #include "vpx_dsp/vpx_dsp_common.h" namespace { @@ -41,7 +42,7 @@ constexpr int kDefaultMaxGfInterval = 16; constexpr int kReadMinGfInterval = 5; constexpr int kReadMaxGfInterval = 13; const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv"; -const double kPsnrThreshold = 30.50; +const double kPsnrThreshold = 30.4; struct ToyRateCtrl { int magic_number; @@ -151,6 +152,19 @@ vpx_rc_status_t rc_send_firstpass_stats_gop_short( return VPX_RC_OK; } +vpx_rc_status_t rc_send_tpl_gop_stats(vpx_rc_model_t rate_ctrl_model, + const VpxTplGopStats *tpl_gop_stats) { + const ToyRateCtrl *toy_rate_ctrl = + static_cast<ToyRateCtrl *>(rate_ctrl_model); + EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber); + EXPECT_GT(tpl_gop_stats->size, 0); + + for (int i = 0; i < tpl_gop_stats->size; ++i) { + EXPECT_GT(tpl_gop_stats->frame_stats_list[i].num_blocks, 0); + } + return VPX_RC_OK; +} + vpx_rc_status_t rc_get_encodeframe_decision( vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *encode_frame_info, @@ -384,7 +398,7 @@ vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay( EXPECT_EQ(encode_frame_info->show_index, 3); EXPECT_EQ(encode_frame_info->gop_index, 0); EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay); - EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2); + EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1); } // When the model recommends an invalid q, valid range [0, 255], @@ -678,7 +692,7 @@ class 
ExtRateCtrlTest : public ::libvpx_test::EncoderTest, void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_QP; rc_funcs.create_model = rc_create_model; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats; @@ -721,10 +735,11 @@ class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval); encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop; + rc_funcs.send_tpl_gop_stats = rc_send_tpl_gop_stats; rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop; rc_funcs.get_gop_decision = rc_get_gop_decision; rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop; @@ -768,7 +783,7 @@ class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -816,7 +831,7 @@ class ExtRateCtrlTestGOPShortOverlay encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -865,7 +880,7 @@ class ExtRateCtrlTestGOPShortNoARF encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval); encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO); - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; @@ -919,7 +934,7 @@ class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest, void PreEncodeFrameHook(::libvpx_test::VideoSource *video, ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { - vpx_rc_funcs_t rc_funcs; + vpx_rc_funcs_t rc_funcs = {}; rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT; rc_funcs.create_model = rc_create_model_gop_short; rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short; diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index ccace719e..c69d43efb 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -55,6 +55,21 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> { ref_dst_ = ref_dst; int error_count = 0; for (int i = 0; i < count_test_block; ++i) { + // TODO(webm:1797): Some of the optimised predictor implementations rely + // on the trailing half of the above_row_ being a copy of the final + // element; however, relying on this in some cases can cause the MD5 tests + // to fail. We have fixed all of these cases for Neon, so fill the whole + // of above_row_ randomly. +#if HAVE_NEON + // Fill edges with random data, try first with saturated values.
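// Iteration 0 saturates every edge sample to mask_ (the bit-depth maximum)
// to exercise overflow-prone arithmetic; later iterations use random values.
// Unlike the non-Neon branch below, this loop runs over the full
// 2 * block_size extent of above_row_, so predictors that read past
// block_size see independent random data instead of a convenient copy of
// the last element (see the TODO above).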
+ for (int x = -1; x < 2 * block_size; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } +#else // Fill edges with random data, try first with saturated values. for (int x = -1; x < block_size; x++) { if (i == 0) { @@ -66,6 +81,7 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> { for (int x = block_size; x < 2 * block_size; x++) { above_row_[x] = above_row_[block_size - 1]; } +#endif for (int y = 0; y < block_size; y++) { if (i == 0) { left_col_[y] = mask_; @@ -80,7 +96,7 @@ class IntraPredTest : public ::testing::TestWithParam<PredParam> { } protected: - virtual void SetUp() { + void SetUp() override { params_ = this->GetParam(); stride_ = params_.block_size * 3; mask_ = (1 << params_.bit_depth) - 1; @@ -243,6 +259,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d45_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d45_predictor_32x32_neon, &vpx_d45_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d63_predictor_4x4_neon, &vpx_d63_predictor_4x4_c, 4, + 8), + IntraPredParam(&vpx_d63_predictor_8x8_neon, &vpx_d63_predictor_8x8_c, 8, + 8), + IntraPredParam(&vpx_d63_predictor_16x16_neon, + &vpx_d63_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d63_predictor_32x32_neon, + &vpx_d63_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d117_predictor_4x4_neon, &vpx_d117_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d117_predictor_8x8_neon, &vpx_d117_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d117_predictor_16x16_neon, + &vpx_d117_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d117_predictor_32x32_neon, + &vpx_d117_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_d135_predictor_4x4_neon, &vpx_d135_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_d135_predictor_8x8_neon, &vpx_d135_predictor_8x8_c, @@ -251,6 +283,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_d135_predictor_16x16_c, 16, 8), IntraPredParam(&vpx_d135_predictor_32x32_neon, &vpx_d135_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d153_predictor_4x4_neon, &vpx_d153_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d153_predictor_8x8_neon, &vpx_d153_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d153_predictor_16x16_neon, + &vpx_d153_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d153_predictor_32x32_neon, + &vpx_d153_predictor_32x32_c, 32, 8), + IntraPredParam(&vpx_d207_predictor_4x4_neon, &vpx_d207_predictor_4x4_c, + 4, 8), + IntraPredParam(&vpx_d207_predictor_8x8_neon, &vpx_d207_predictor_8x8_c, + 8, 8), + IntraPredParam(&vpx_d207_predictor_16x16_neon, + &vpx_d207_predictor_16x16_c, 16, 8), + IntraPredParam(&vpx_d207_predictor_32x32_neon, + &vpx_d207_predictor_32x32_c, 32, 8), IntraPredParam(&vpx_dc_128_predictor_4x4_neon, &vpx_dc_128_predictor_4x4_c, 4, 8), IntraPredParam(&vpx_dc_128_predictor_8x8_neon, @@ -441,6 +489,15 @@ INSTANTIATE_TEST_SUITE_P( &vpx_v_predictor_32x32_c, 32, 8))); #endif // HAVE_VSX +#if HAVE_LSX +INSTANTIATE_TEST_SUITE_P( + LSX, VP9IntraPredTest, + ::testing::Values(IntraPredParam(&vpx_dc_predictor_8x8_lsx, + &vpx_dc_predictor_8x8_c, 8, 8), + IntraPredParam(&vpx_dc_predictor_16x16_lsx, + &vpx_dc_predictor_16x16_c, 16, 8))); +#endif // HAVE_LSX + #if CONFIG_VP9_HIGHBITDEPTH typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, @@ -832,6 +889,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + 
&vpx_highbd_d63_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -840,6 +913,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -908,6 +997,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -916,6 +1021,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 10), + 
HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, @@ -984,6 +1105,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d45_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d45_predictor_32x32_neon, &vpx_highbd_d45_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_4x4_neon, + &vpx_highbd_d63_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_8x8_neon, + &vpx_highbd_d63_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_16x16_neon, + &vpx_highbd_d63_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d63_predictor_32x32_neon, + &vpx_highbd_d63_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_4x4_neon, + &vpx_highbd_d117_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_8x8_neon, + &vpx_highbd_d117_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_16x16_neon, + &vpx_highbd_d117_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d117_predictor_32x32_neon, + &vpx_highbd_d117_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_4x4_neon, &vpx_highbd_d135_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_8x8_neon, @@ -992,6 +1129,22 @@ INSTANTIATE_TEST_SUITE_P( &vpx_highbd_d135_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_d135_predictor_32x32_neon, &vpx_highbd_d135_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_4x4_neon, + &vpx_highbd_d153_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_8x8_neon, + &vpx_highbd_d153_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_16x16_neon, + &vpx_highbd_d153_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d153_predictor_32x32_neon, + &vpx_highbd_d153_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_neon, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_neon, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_neon, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_neon, + &vpx_highbd_d207_predictor_32x32_c, 32, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_4x4_neon, &vpx_highbd_dc_128_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_8x8_neon, diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc index 931ac30a3..fe3cd1aba 100644 ---
a/test/vp9_lossless_test.cc +++ b/test/vp9_lossless_test.cc @@ -29,15 +29,15 @@ class LosslessTest : EncoderTest(GET_PARAM(0)), psnr_(kMaxPsnr), nframes_(0), encoding_mode_(GET_PARAM(1)) {} - virtual ~LosslessTest() {} + ~LosslessTest() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { // Only call Control if quantizer > 0 to verify that using quantizer // alone will activate lossless @@ -47,12 +47,12 @@ class LosslessTest } } - virtual void BeginPassHook(unsigned int /*pass*/) { + void BeginPassHook(unsigned int /*pass*/) override { psnr_ = kMaxPsnr; nframes_ = 0; } - virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { + void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override { if (pkt->data.psnr.psnr[0] < psnr_) psnr_ = pkt->data.psnr.psnr[0]; } diff --git a/test/vp9_motion_vector_test.cc b/test/vp9_motion_vector_test.cc index 6b1082a10..495ea11fc 100644 --- a/test/vp9_motion_vector_test.cc +++ b/test/vp9_motion_vector_test.cc @@ -42,9 +42,9 @@ class MotionVectorTestLarge : EncoderTest(GET_PARAM(0)), encoding_mode_(GET_PARAM(1)), cpu_used_(GET_PARAM(2)), mv_test_mode_(GET_PARAM(3)) {} - virtual ~MotionVectorTestLarge() {} + ~MotionVectorTestLarge() override = default; - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(encoding_mode_); if (encoding_mode_ != ::libvpx_test::kRealTime) { @@ -59,8 +59,8 @@ class MotionVectorTestLarge } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, cpu_used_); encoder->Control(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, mv_test_mode_); diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 587cec692..e00ab4022 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -26,6 +26,7 @@ #include "test/util.h" #include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vpx_ports/msvc.h" @@ -38,33 +39,44 @@ namespace { const int number_of_iterations = 100; typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, + const macroblock_plane *mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct ScanOrder *const scan_order); typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t, int /*max_size*/, bool /*is_fp*/> QuantizeParam; +// Wrapper for 32x32 version which does not use count +typedef void (*Quantize32x32Func)(const tran_low_t *coeff, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order); + +template <Quantize32x32Func fn> +void Quant32x32Wrapper(const tran_low_t *coeff, intptr_t count, + const macroblock_plane *const mb_plane, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const 
scan_order) { + (void)count; + fn(coeff, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); +} + // Wrapper for FP version which does not use zbin or quant_shift. typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count, - const int16_t *round, const int16_t *quant, + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, - const int16_t *scan, const int16_t *iscan); + const struct ScanOrder *const scan_order); template <QuantizeFPFunc fn> void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, - const int16_t *zbin, const int16_t *round, - const int16_t *quant, const int16_t *quant_shift, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - const int16_t *dequant, uint16_t *eob, const int16_t *scan, - const int16_t *iscan) { - (void)zbin; - (void)quant_shift; - - fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan); + const macroblock_plane *const mb_plane, tran_low_t *qcoeff, + tran_low_t *dqcoeff, const int16_t *dequant, uint16_t *eob, + const struct ScanOrder *const scan_order) { + fn(coeff, count, mb_plane, qcoeff, dqcoeff, dequant, eob, scan_order); } void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round, @@ -119,17 +131,21 @@ class VP9QuantizeBase : public AbstractBench { #else max_value_ = (1 << bit_depth_) - 1; #endif - zbin_ptr_ = + + mb_plane_ = reinterpret_cast<macroblock_plane *>( + vpx_memalign(16, sizeof(macroblock_plane))); + + zbin_ptr_ = mb_plane_->zbin = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_))); - round_fp_ptr_ = reinterpret_cast<int16_t *>( + round_fp_ptr_ = mb_plane_->round_fp = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*round_fp_ptr_))); - quant_fp_ptr_ = reinterpret_cast<int16_t *>( + quant_fp_ptr_ = mb_plane_->quant_fp = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*quant_fp_ptr_))); - round_ptr_ = + round_ptr_ = mb_plane_->round = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*round_ptr_))); - quant_ptr_ = + quant_ptr_ = mb_plane_->quant = reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*quant_ptr_))); - quant_shift_ptr_ = reinterpret_cast<int16_t *>( + quant_shift_ptr_ = mb_plane_->quant_shift = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_))); dequant_ptr_ = reinterpret_cast<int16_t *>( vpx_memalign(16, 8 * sizeof(*dequant_ptr_))); @@ -138,7 +154,8 @@ class VP9QuantizeBase : public AbstractBench { q_ptr_ = (is_fp_) ? 
quant_fp_ptr_ : quant_ptr_; } - ~VP9QuantizeBase() { + ~VP9QuantizeBase() override { + vpx_free(mb_plane_); vpx_free(zbin_ptr_); vpx_free(round_fp_ptr_); vpx_free(quant_fp_ptr_); @@ -146,6 +163,7 @@ class VP9QuantizeBase : public AbstractBench { vpx_free(quant_ptr_); vpx_free(quant_shift_ptr_); vpx_free(dequant_ptr_); + mb_plane_ = nullptr; zbin_ptr_ = nullptr; round_fp_ptr_ = nullptr; quant_fp_ptr_ = nullptr; @@ -157,9 +175,10 @@ class VP9QuantizeBase : public AbstractBench { } protected: + macroblock_plane *mb_plane_; int16_t *zbin_ptr_; - int16_t *round_fp_ptr_; int16_t *quant_fp_ptr_; + int16_t *round_fp_ptr_; int16_t *round_ptr_; int16_t *quant_ptr_; int16_t *quant_shift_ptr_; @@ -174,7 +193,7 @@ class VP9QuantizeBase : public AbstractBench { int16_t *r_ptr_; int16_t *q_ptr_; int count_; - const scan_order *scan_; + const ScanOrder *scan_; uint16_t eob_; }; @@ -186,17 +205,15 @@ class VP9QuantizeTest : public VP9QuantizeBase, quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {} protected: - virtual void Run(); + void Run() override; void Speed(bool is_median); const QuantizeFunc quantize_op_; const QuantizeFunc ref_quantize_op_; }; void VP9QuantizeTest::Run() { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan, - scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_); } void VP9QuantizeTest::Speed(bool is_median) { @@ -266,19 +283,18 @@ void VP9QuantizeTest::Speed(bool is_median) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, - q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + scan_); } vpx_usec_timer_mark(&timer); vpx_usec_timer_start(&simd_timer); for (int n = 0; n < kNumTests; ++n) { - quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), - dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, - scan_->scan, scan_->iscan); + quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), + dequant_ptr_, &eob_, scan_); } vpx_usec_timer_mark(&simd_timer); @@ -298,14 +314,16 @@ void VP9QuantizeTest::Speed(bool is_median) { // determine if further multiplication operations are needed. // Based on vp9_quantize_fp_sse2(). inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan, + const struct ScanOrder *const scan_order, int is_32x32) { int i, eob = -1; const int thr = dequant_ptr[1] >> (1 + is_32x32); - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. 
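All of the quantize-test hunks in this file follow one mechanical refactor: the six loose pointer arguments (zbin, round, quant, quant_shift, scan, iscan) are folded into a macroblock_plane and a ScanOrder, with QuantFPWrapper and Quant32x32Wrapper adapting the variants that ignore part of that state. A minimal sketch of the new calling convention (the vp9_default_scan_orders lookup and the buffer names are assumptions; the signature matches this diff):

// Hedged sketch: invoking a quantizer through the consolidated interface.
macroblock_plane plane = {};        // Only the fields a given variant reads
plane.round_fp = round_fp_values;   //   need to be populated; the FP path
plane.quant_fp = quant_fp_values;   //   uses round_fp/quant_fp only.
const ScanOrder *order = &vp9_default_scan_orders[TX_16X16];  // assumed table
uint16_t eob = 0;
vp9_quantize_fp_c(coeff, 256, &plane, qcoeff, dqcoeff, dequant, &eob, order);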
@@ -372,21 +390,21 @@ inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0); + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 0); } void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr, - dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1); + const struct ScanOrder *const scan_order) { + quant_fp_nz(coeff_ptr, n_coeffs, mb_plane, qcoeff_ptr, dqcoeff_ptr, + dequant_ptr, eob_ptr, scan_order, 1); } TEST_P(VP9QuantizeTest, OperationCheck) { @@ -417,15 +435,13 @@ TEST_P(VP9QuantizeTest, OperationCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -475,15 +491,13 @@ TEST_P(VP9QuantizeTest, EOBCheck) { GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_, quant_shift_ptr_, dequant_ptr_, round_fp_ptr_, quant_fp_ptr_); - ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, ref_qcoeff.TopLeftPixel(), - ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob, - scan_->scan, scan_->iscan); + ref_quantize_op_(coeff_.TopLeftPixel(), count_, mb_plane_, + ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(), + dequant_ptr_, &ref_eob, scan_); ASM_REGISTER_STATE_CHECK(quantize_op_( - coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_, - quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(), - dequant_ptr_, &eob_, scan_->scan, scan_->iscan)); + coeff_.TopLeftPixel(), count_, mb_plane_, qcoeff_.TopLeftPixel(), + dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_)); EXPECT_TRUE(qcoeff_.CheckValues(ref_qcoeff)); EXPECT_TRUE(dqcoeff_.CheckValues(ref_dqcoeff)); @@ -510,27 +524,30 @@ using std::make_tuple; INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, ::testing::Values( - make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16, false), 
make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_sse2, vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_sse2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_8, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_sse2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( SSE2, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, + ::testing::Values(make_tuple(vpx_quantize_b_sse2, vpx_quantize_b_c, VPX_BITS_8, 16, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, @@ -541,11 +558,11 @@ INSTANTIATE_TEST_SUITE_P( #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c, + ::testing::Values(make_tuple(vpx_quantize_b_ssse3, vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_ssse3, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_ssse3>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_ssse3>, &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true), @@ -555,13 +572,13 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_SSSE3 #if HAVE_AVX -INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_avx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + AVX, VP9QuantizeTest, + ::testing::Values(make_tuple(vpx_quantize_b_avx, vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false))); #endif // HAVE_AVX #if VPX_ARCH_X86_64 && HAVE_AVX2 @@ -577,22 +594,26 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>, &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16, + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_avx2, 
vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_avx2, vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c, + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_avx2, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false))); + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_12, 32, false))); #else INSTANTIATE_TEST_SUITE_P( AVX2, VP9QuantizeTest, @@ -602,11 +623,11 @@ INSTANTIATE_TEST_SUITE_P( make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>, &QuantFPWrapper<quantize_fp_32x32_nz_c>, VPX_BITS_8, 32, true), - make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, + make_tuple(vpx_quantize_b_avx2, vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_avx2, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false))); + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_avx2>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false))); #endif // CONFIG_VP9_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -617,20 +638,24 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, VPX_BITS_10, 16, false), - make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c, + make_tuple(vpx_highbd_quantize_b_neon, vpx_highbd_quantize_b_c, VPX_BITS_12, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c, + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false), - make_tuple(&vpx_highbd_quantize_b_32x32_neon, - &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_10, 32, false), + make_tuple(&Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_highbd_quantize_b_32x32_c>, + VPX_BITS_12, 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>, @@ 
-641,9 +666,9 @@ INSTANTIATE_TEST_SUITE_P( NEON, VP9QuantizeTest, ::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_neon, - &vpx_quantize_b_32x32_c, VPX_BITS_8, 32, - false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_neon>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), @@ -670,13 +695,13 @@ INSTANTIATE_TEST_SUITE_P( #endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH #if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH -INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest, - ::testing::Values(make_tuple(&vpx_quantize_b_lsx, - &vpx_quantize_b_c, - VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_lsx, - &vpx_quantize_b_32x32_c, - VPX_BITS_8, 32, false))); +INSTANTIATE_TEST_SUITE_P( + LSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_lsx, &vpx_quantize_b_c, + VPX_BITS_8, 16, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_lsx>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + VPX_BITS_8, 32, false))); #endif // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH // Only useful to compare "Speed" test results. @@ -684,8 +709,9 @@ INSTANTIATE_TEST_SUITE_P( DISABLED_C, VP9QuantizeTest, ::testing::Values( make_tuple(&vpx_quantize_b_c, &vpx_quantize_b_c, VPX_BITS_8, 16, false), - make_tuple(&vpx_quantize_b_32x32_c, &vpx_quantize_b_32x32_c, VPX_BITS_8, - 32, false), + make_tuple(&Quant32x32Wrapper<vpx_quantize_b_32x32_c>, + &Quant32x32Wrapper<vpx_quantize_b_32x32_c>, VPX_BITS_8, 32, + false), make_tuple(&QuantFPWrapper<vp9_quantize_fp_c>, &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true), make_tuple(&QuantFPWrapper<quantize_fp_nz_c>, diff --git a/test/vp9_ratectrl_rtc_test.cc b/test/vp9_ratectrl_rtc_test.cc index 1d1a78f43..f7be47542 100644 --- a/test/vp9_ratectrl_rtc_test.cc +++ b/test/vp9_ratectrl_rtc_test.cc @@ -31,6 +31,7 @@ const int kTemporalId2Layer[2] = { 0, 1 }; const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 }; const int kTemporalRateAllocation2Layer[2] = { 60, 100 }; const int kSpatialLayerBitrate[3] = { 200, 400, 1000 }; +const int kSpatialLayerBitrateLow[3] = { 50, 100, 400 }; class RcInterfaceTest : public ::libvpx_test::EncoderTest, @@ -38,28 +39,34 @@ class RcInterfaceTest public: RcInterfaceTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), - encoder_exit_(false) {} + encoder_exit_(false), frame_drop_thresh_(0), num_drops_(0) {} - virtual ~RcInterfaceTest() {} + ~RcInterfaceTest() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); - encoder->Control(VP9E_SET_TUNE_CONTENT, 0); + if (rc_cfg_.is_screen) { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN); + } else { + encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT); + } encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000); encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? 
KEY_FRAME : INTER_FRAME; - if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) { + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + if (rc_cfg_.rc_mode == VPX_CBR && + frame_params_.frame_type == libvpx::RcFrameType::kInterFrame) { // Disable golden frame update. frame_flags_ |= VP8_EFLAG_NO_UPD_GF; frame_flags_ |= VP8_EFLAG_NO_UPD_ARF; @@ -67,20 +74,23 @@ class RcInterfaceTest encoder_exit_ = video->frame() == kNumFrames; } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { if (encoder_exit_) { return; } int loopfilter_level, qp; encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - rc_api_->ComputeQP(frame_params_); - ASSERT_EQ(rc_api_->GetQP(), qp); - ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kOk) { + ASSERT_EQ(rc_api_->GetQP(), qp); + ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + } else { + num_drops_++; + } } - virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { - rc_api_->PostEncodeUpdate(pkt->data.frame.sz); + void FramePktHook(const vpx_codec_cx_pkt_t *pkt) override { + rc_api_->PostEncodeUpdate(pkt->data.frame.sz, frame_params_); } void RunOneLayer() { @@ -95,6 +105,42 @@ class RcInterfaceTest ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunOneLayerScreen() { + SetConfig(GET_PARAM(2)); + rc_cfg_.is_screen = true; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunOneLayerDropFramesCBR() { + if (GET_PARAM(2) != VPX_CBR) { + GTEST_SKIP() << "Frame dropping is only for CBR mode."; + } + frame_drop_thresh_ = 30; + SetConfig(GET_PARAM(2)); + // Use lower bitrate, lower max-q, and enable frame dropper. + rc_cfg_.target_bandwidth = 200; + cfg_.rc_target_bitrate = 200; + rc_cfg_.max_quantizer = 50; + cfg_.rc_max_quantizer = 50; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + frame_params_.spatial_layer_id = 0; + frame_params_.temporal_layer_id = 0; + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunOneLayerVBRPeriodicKey() { if (GET_PARAM(2) != VPX_VBR) return; key_interval_ = 100; @@ -132,6 +178,7 @@ class RcInterfaceTest rc_cfg_.min_quantizers[0] = 2; rc_cfg_.rc_mode = rc_mode; rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; // Encoder settings for ground truth. 
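Taken together, the hooks above define the calling contract for the external rate controller: ComputeQP() now returns a FrameDropDecision, and only frames it accepts are cross-checked against the encoder's QP and loopfilter level and fed back through PostEncodeUpdate(), which now also takes the frame parameters. A sketch of that per-frame driver loop, assuming the ratectrl_rtc API shown above and a hypothetical EncodeWithQp() helper for the actual encode step:

    // Per-frame driver implied by the hooks above. EncodeWithQp() is a
    // hypothetical stand-in for running the encoder at the chosen QP.
    libvpx::VP9FrameParamsQpRTC params;
    params.frame_type = libvpx::RcFrameType::kInterFrame;
    if (rc_api->ComputeQP(params) == libvpx::FrameDropDecision::kOk) {
      const int qp = rc_api->GetQP();               // QP picked by the external RC
      const int lf = rc_api->GetLoopfilterLevel();  // matching loopfilter level
      const size_t bytes = EncodeWithQp(qp, lf);    // hypothetical encode step
      rc_api->PostEncodeUpdate(bytes, params);      // report the actual size
    }
    // else: the frame is dropped and nothing is encoded or reported.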
cfg_.g_w = 1280; @@ -150,6 +197,7 @@ class RcInterfaceTest cfg_.rc_target_bitrate = 1000; cfg_.kf_min_dist = key_interval_; cfg_.kf_max_dist = key_interval_; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; } std::unique_ptr<libvpx::VP9RateControlRTC> rc_api_; @@ -158,23 +206,31 @@ class RcInterfaceTest int key_interval_; libvpx::VP9FrameParamsQpRTC frame_params_; bool encoder_exit_; + int frame_drop_thresh_; + int num_drops_; }; -class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, - public ::libvpx_test::CodecTestWithParam<int> { +class RcInterfaceSvcTest + : public ::libvpx_test::EncoderTest, + public ::libvpx_test::CodecTestWith2Params<int, bool> { public: - RcInterfaceSvcTest() : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)) {} - virtual ~RcInterfaceSvcTest() {} + RcInterfaceSvcTest() + : EncoderTest(GET_PARAM(0)), aq_mode_(GET_PARAM(1)), key_interval_(3000), + dynamic_spatial_layers_(0), inter_layer_pred_off_(GET_PARAM(2)), + parallel_spatial_layers_(false), frame_drop_thresh_(0), + max_consec_drop_(INT_MAX), num_drops_(0) {} + ~RcInterfaceSvcTest() override = default; protected: - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); } - virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { + current_superframe_ = 0; encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, aq_mode_); encoder->Control(VP9E_SET_TUNE_CONTENT, 0); @@ -182,11 +238,23 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Control(VP9E_SET_RTC_EXTERNAL_RATECTRL, 1); encoder->Control(VP9E_SET_SVC, 1); encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_); + if (inter_layer_pred_off_) { + encoder->Control(VP9E_SET_SVC_INTER_LAYER_PRED, + INTER_LAYER_PRED_OFF_NONKEY); + } + if (frame_drop_thresh_ > 0) { + vpx_svc_frame_drop_t svc_drop_frame; + svc_drop_frame.framedrop_mode = FULL_SUPERFRAME_DROP; + for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) + svc_drop_frame.framedrop_thresh[sl] = frame_drop_thresh_; + svc_drop_frame.max_consec_drop = max_consec_drop_; + encoder->Control(VP9E_SET_SVC_FRAME_DROP_LAYER, &svc_drop_frame); + } } - frame_params_.frame_type = - video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME; + frame_params_.frame_type = video->frame() % key_interval_ == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; encoder_exit_ = video->frame() == kNumFrames; - current_superframe_ = video->frame(); if (dynamic_spatial_layers_ == 1) { if (video->frame() == 100) { // Go down to 2 spatial layers: set top SL to 0 bitrate. @@ -201,7 +269,7 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.layer_target_bitrate[6] = 0; rc_cfg_.layer_target_bitrate[7] = 0; rc_cfg_.layer_target_bitrate[8] = 0; - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); } else if (video->frame() == 200) { // Go down to 1 spatial layer. // Update the encoder config. 
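The dynamic-layer hunks that follow also start asserting on UpdateRateControl()'s return value whenever the layer set is reconfigured mid-stream. Reduced to its essentials, dropping the top spatial layer looks like this (a sketch using the fixture's cfg_/rc_cfg_ members; indices assume 3 spatial x 3 temporal layers, so the top layer occupies entries 6..8):

    // Disable spatial layer 2 at runtime: zero its per-layer bitrates in
    // both the encoder config and the external-RC config, then push both.
    for (int tl = 0; tl < 3; ++tl) {
      cfg_.layer_target_bitrate[2 * 3 + tl] = 0;     // encoder-side view
      rc_cfg_.layer_target_bitrate[2 * 3 + tl] = 0;  // external-RC view
    }
    encoder->Config(&cfg_);
    ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_));  // must succeed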
@@ -215,8 +283,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.layer_target_bitrate[3] = 0; rc_cfg_.layer_target_bitrate[4] = 0; rc_cfg_.layer_target_bitrate[5] = 0; - rc_api_->UpdateRateControl(rc_cfg_); - } else if (0 && video->frame() == 280) { + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); + } else if (/*DISABLES CODE*/ (false) && video->frame() == 280) { // TODO(marpan): Re-enable this going back up when issue is fixed. // Go back up to 3 spatial layers. // Update the encoder config: use the original bitrates. @@ -224,52 +292,92 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, encoder->Config(&cfg_); // Update the RC config. SetRCConfigSvc(3, 3); - rc_api_->UpdateRateControl(rc_cfg_); + ASSERT_TRUE(rc_api_->UpdateRateControl(rc_cfg_)); } } } - virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) { + virtual void SetFrameParamsSvc(int sl) { + frame_params_.spatial_layer_id = sl; + if (rc_cfg_.ts_number_layers == 3) + frame_params_.temporal_layer_id = + kTemporalId3Layer[current_superframe_ % 4]; + else if (rc_cfg_.ts_number_layers == 2) + frame_params_.temporal_layer_id = + kTemporalId2Layer[current_superframe_ % 2]; + else + frame_params_.temporal_layer_id = 0; + frame_params_.frame_type = + current_superframe_ % key_interval_ == 0 && sl == 0 + ? libvpx::RcFrameType::kKeyFrame + : libvpx::RcFrameType::kInterFrame; + } + + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { + if (encoder_exit_) { + return; + } + int superframe_is_dropped = false; ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0; + std::vector<int> rc_qp; + // For FULL_SUPERFRAME_DROP: the full superframe drop decision is + // determined on the base spatial layer. + SetFrameParamsSvc(0); + if (rc_api_->ComputeQP(frame_params_) == libvpx::FrameDropDecision::kDrop) { + superframe_is_dropped = true; + num_drops_++; + } while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { + ASSERT_EQ(superframe_is_dropped, false); ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf), pkt->data.frame.sz); - for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { - if (sizes_[sl] > 0) { - frame_params_.spatial_layer_id = sl; - if (rc_cfg_.ts_number_layers == 3) - frame_params_.temporal_layer_id = - kTemporalId3Layer[current_superframe_ % 4]; - else if (rc_cfg_.ts_number_layers == 2) - frame_params_.temporal_layer_id = - kTemporalId2Layer[current_superframe_ % 2]; - else - frame_params_.temporal_layer_id = 0; - rc_api_->ComputeQP(frame_params_); - frame_params_.frame_type = INTER_FRAME; - rc_api_->PostEncodeUpdate(sizes_[sl]); + if (!parallel_spatial_layers_ || current_superframe_ == 0) { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + // For sl=0 ComputeQP() is already called above (line 310). + if (sl > 0) rc_api_->ComputeQP(frame_params_); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } + } + } else { + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + // For sl=0 ComputeQP() is already called above (line 310). 
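The restructured hook above separates two valid orderings. With FULL_SUPERFRAME_DROP the drop decision is made once, on the base spatial layer; after that, the sequential path interleaves ComputeQP() and PostEncodeUpdate() per layer, while the parallel path (inter-layer prediction off) issues every ComputeQP() before any PostEncodeUpdate(). Compressed to the essentials (a sketch; rc, sizes, and SetParams() stand in for the fixture's rc_api_, sizes_, and SetFrameParamsSvc()):

    // Sequential SVC: layer sl's QP may depend on layer sl-1's encoded
    // size, so compute and update must interleave.
    for (int sl = 0; sl < num_layers; ++sl) {
      SetParams(sl);
      rc->ComputeQP(params);
      rc->PostEncodeUpdate(sizes[sl], params);
    }

    // Parallel SVC (inter-layer prediction off): all QPs first, then all
    // updates, matching encoders that process spatial layers concurrently.
    for (int sl = 0; sl < num_layers; ++sl) {
      SetParams(sl);
      rc->ComputeQP(params);
    }
    for (int sl = 0; sl < num_layers; ++sl) {
      SetParams(sl);
      rc->PostEncodeUpdate(sizes[sl], params);
    }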
+ if (sizes_[sl] > 0 && sl > 0) { + SetFrameParamsSvc(sl); + rc_api_->ComputeQP(frame_params_); + } + } + for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) { + if (sizes_[sl] > 0) { + SetFrameParamsSvc(sl); + rc_api_->PostEncodeUpdate(sizes_[sl], frame_params_); + rc_qp.push_back(rc_api_->GetQP()); + } } } } - if (!encoder_exit_) { - int loopfilter_level, qp; + if (!superframe_is_dropped) { + int loopfilter_level; + std::vector<int> encoder_qp(VPX_SS_MAX_LAYERS, 0); encoder->Control(VP9E_GET_LOOPFILTER_LEVEL, &loopfilter_level); - encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp); - ASSERT_EQ(rc_api_->GetQP(), qp); + encoder->Control(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, encoder_qp.data()); + encoder_qp.resize(rc_qp.size()); + ASSERT_EQ(rc_qp, encoder_qp); ASSERT_EQ(rc_api_->GetLoopfilterLevel(), loopfilter_level); + current_superframe_++; } } // This method needs to be overridden because non-reference frames are // expected to be mismatched frames as the encoder will avoid loopfilter on // these frames. - virtual void MismatchHook(const vpx_image_t * /*img1*/, - const vpx_image_t * /*img2*/) {} + void MismatchHook(const vpx_image_t * /*img1*/, + const vpx_image_t * /*img2*/) override {} void RunSvc() { - dynamic_spatial_layers_ = 0; SetRCConfigSvc(3, 3); - key_interval_ = 10000; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderConfigSvc(3, 3); @@ -279,8 +387,22 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } + void RunSvcDropFramesCBR() { + max_consec_drop_ = 10; + frame_drop_thresh_ = 30; + SetRCConfigSvc(3, 3); + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + // Check that some frames were dropped, otherwise test has no value. + ASSERT_GE(num_drops_, 1); + } + void RunSvcPeriodicKey() { - dynamic_spatial_layers_ = 0; SetRCConfigSvc(3, 3); key_interval_ = 100; rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); @@ -295,7 +417,19 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, void RunSvcDynamicSpatial() { dynamic_spatial_layers_ = 1; SetRCConfigSvc(3, 3); - key_interval_ = 10000; + rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); + SetEncoderConfigSvc(3, 3); + + ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv", + 1280, 720, 30, 1, 0, kNumFrames); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + } + + void RunSvcParallelSpatialLayers() { + if (!inter_layer_pred_off_) return; + parallel_spatial_layers_ = true; + SetRCConfigSvc(3, 3); rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_); SetEncoderConfigSvc(3, 3); @@ -391,12 +525,14 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, cfg_.kf_max_dist = 9999; cfg_.rc_overshoot_pct = 50; cfg_.rc_undershoot_pct = 50; + cfg_.rc_dropframe_thresh = frame_drop_thresh_; cfg_.rc_target_bitrate = 0; for (int sl = 0; sl < number_spatial_layers; sl++) { int spatial_bitrate = 0; if (number_spatial_layers <= 3) - spatial_bitrate = kSpatialLayerBitrate[sl]; + spatial_bitrate = frame_drop_thresh_ > 0 ? 
kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; for (int tl = 0; tl < number_temporal_layers; tl++) { int layer = sl * number_temporal_layers + tl; if (number_temporal_layers == 3) @@ -431,6 +567,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, rc_cfg_.framerate = 30.0; rc_cfg_.rc_mode = VPX_CBR; rc_cfg_.aq_mode = aq_mode_; + rc_cfg_.frame_drop_thresh = frame_drop_thresh_; + rc_cfg_.max_consec_drop = max_consec_drop_; if (number_spatial_layers == 3) { rc_cfg_.scaling_factor_num[0] = 1; @@ -464,7 +602,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, for (int sl = 0; sl < number_spatial_layers; sl++) { int spatial_bitrate = 0; if (number_spatial_layers <= 3) - spatial_bitrate = kSpatialLayerBitrate[sl]; + spatial_bitrate = frame_drop_thresh_ > 0 ? kSpatialLayerBitrateLow[sl] + : kSpatialLayerBitrate[sl]; for (int tl = 0; tl < number_temporal_layers; tl++) { int layer = sl * number_temporal_layers + tl; if (number_temporal_layers == 3) @@ -498,19 +637,36 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest, uint32_t sizes_[8]; int key_interval_; int dynamic_spatial_layers_; + bool inter_layer_pred_off_; + // ComputeQP() and PostEncodeUpdate() don't need to be sequential for KSVC. + bool parallel_spatial_layers_; + int frame_drop_thresh_; + int max_consec_drop_; + int num_drops_; }; TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); } +TEST_P(RcInterfaceTest, OneLayerDropFramesCBR) { RunOneLayerDropFramesCBR(); } + +TEST_P(RcInterfaceTest, OneLayerScreen) { RunOneLayerScreen(); } + TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); } TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); } +TEST_P(RcInterfaceSvcTest, SvcDropFramesCBR) { RunSvcDropFramesCBR(); } + +TEST_P(RcInterfaceSvcTest, SvcParallelSpatialLayers) { + RunSvcParallelSpatialLayers(); +} + TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); } TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); } VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3), ::testing::Values(VPX_CBR, VPX_VBR)); -VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3)); +VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3), + ::testing::Values(true, false)); } // namespace diff --git a/test/vp9_roi_test.cc b/test/vp9_roi_test.cc index e8373c4c0..a9347fb36 100644 --- a/test/vp9_roi_test.cc +++ b/test/vp9_roi_test.cc @@ -84,9 +84,9 @@ class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, public ::testing::Test { protected: RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {} - virtual ~RoiMaskBackgroundSkip() { free(roi_.roi_map); } + ~RoiMaskBackgroundSkip() override { free(roi_.roi_map); } - virtual void SetUp() { + void SetUp() override { InitializeConfig(); SetMode(::libvpx_test::kRealTime); SetRoi(); @@ -114,8 +114,8 @@ class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest, } } - virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video, - ::libvpx_test::Encoder *encoder) { + void PreEncodeFrameHook(::libvpx_test::VideoSource *video, + ::libvpx_test::Encoder *encoder) override { if (video->frame() == 0) { encoder->Control(VP8E_SET_CPUUSED, 7); encoder->Control(VP9E_SET_AQ_MODE, 3); diff --git a/test/vp9_scale_test.cc b/test/vp9_scale_test.cc index 2d1203fb8..049a10a61 100644 --- a/test/vp9_scale_test.cc +++ b/test/vp9_scale_test.cc @@ -33,10 +33,10 @@ typedef void (*ScaleFrameFunc)(const YV12_BUFFER_CONFIG *src, class ScaleTest : public VpxScaleBase, public 
::testing::TestWithParam<ScaleFrameFunc> { public: - virtual ~ScaleTest() {} + ~ScaleTest() override = default; protected: - virtual void SetUp() { scale_fn_ = GetParam(); } + void SetUp() override { scale_fn_ = GetParam(); } void ReferenceScaleFrame(INTERP_FILTER filter_type, int phase_scaler) { vp9_scale_and_extend_frame_c(&img_, &ref_img_, filter_type, phase_scaler); diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index a57082f1e..78deb5190 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -34,10 +34,10 @@ namespace vp9 { class VP9SubtractBlockTest : public AbstractBench, public ::testing::TestWithParam<SubtractFunc> { public: - virtual void TearDown() { libvpx_test::ClearSystemState(); } + void TearDown() override { libvpx_test::ClearSystemState(); } protected: - virtual void Run() { + void Run() override { GetParam()(block_height_, block_width_, diff_, block_width_, src_, block_width_, pred_, block_width_); } @@ -176,7 +176,7 @@ using Params = std::tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc>; class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> { public: - virtual void SetUp() { + void SetUp() override { block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)]; block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)]; bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(1)); @@ -198,7 +198,7 @@ class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> { ASSERT_NE(diff_, nullptr); } - virtual void TearDown() { + void TearDown() override { vpx_free(CONVERT_TO_SHORTPTR(src_)); vpx_free(CONVERT_TO_SHORTPTR(pred_)); vpx_free(diff_); diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc index 1ceef8185..c0cea681d 100644 --- a/test/vp9_thread_test.cc +++ b/test/vp9_thread_test.cc @@ -26,10 +26,10 @@ using std::string; class VPxWorkerThreadTest : public ::testing::TestWithParam<bool> { protected: - virtual ~VPxWorkerThreadTest() {} - virtual void SetUp() { vpx_get_worker_interface()->init(&worker_); } + ~VPxWorkerThreadTest() override = default; + void SetUp() override { vpx_get_worker_interface()->init(&worker_); } - virtual void TearDown() { vpx_get_worker_interface()->end(&worker_); } + void TearDown() override { vpx_get_worker_interface()->end(&worker_); } void Run(VPxWorker *worker) { const bool synchronous = GetParam(); diff --git a/test/vpx_scale_test.cc b/test/vpx_scale_test.cc index 7eea437fc..3897a6088 100644 --- a/test/vpx_scale_test.cc +++ b/test/vpx_scale_test.cc @@ -38,10 +38,10 @@ class ExtendBorderTest : public VpxScaleBase, public ::testing::TestWithParam<ExtendFrameBorderFunc> { public: - virtual ~ExtendBorderTest() {} + ~ExtendBorderTest() override = default; protected: - virtual void SetUp() { extend_fn_ = GetParam(); } + void SetUp() override { extend_fn_ = GetParam(); } void ExtendBorder() { ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); } @@ -68,10 +68,10 @@ INSTANTIATE_TEST_SUITE_P(C, ExtendBorderTest, class CopyFrameTest : public VpxScaleBase, public ::testing::TestWithParam<CopyFrameFunc> { public: - virtual ~CopyFrameTest() {} + ~CopyFrameTest() override = default; protected: - virtual void SetUp() { copy_frame_fn_ = GetParam(); } + void SetUp() override { copy_frame_fn_ = GetParam(); } void CopyFrame() { ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &dst_img_)); diff --git a/test/webm_video_source.h b/test/webm_video_source.h index d24592629..6ab50c849 100644 --- a/test/webm_video_source.h +++ b/test/webm_video_source.h @@ -29,16 +29,16 @@ class WebMVideoSource : 
public CompressedVideoSource { webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0), end_of_file_(false) {} - virtual ~WebMVideoSource() { + ~WebMVideoSource() override { if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file); webm_free(webm_ctx_); delete vpx_ctx_; delete webm_ctx_; } - virtual void Init() {} + void Init() override {} - virtual void Begin() { + void Begin() override { vpx_ctx_->file = OpenTestDataFile(file_name_); ASSERT_NE(vpx_ctx_->file, nullptr) << "Input file open failed. Filename: " << file_name_; @@ -48,7 +48,7 @@ class WebMVideoSource : public CompressedVideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } @@ -74,11 +74,11 @@ class WebMVideoSource : public CompressedVideoSource { } while (!webm_ctx_->is_key_frame && !end_of_file_); } - virtual const uint8_t *cxdata() const { + const uint8_t *cxdata() const override { return end_of_file_ ? nullptr : buf_; } - virtual size_t frame_size() const { return buf_sz_; } - virtual unsigned int frame_number() const { return frame_; } + size_t frame_size() const override { return buf_sz_; } + unsigned int frame_number() const override { return frame_; } protected: std::string file_name_; diff --git a/test/y4m_test.cc b/test/y4m_test.cc index 32f2cd51d..78a944fd0 100644 --- a/test/y4m_test.cc +++ b/test/y4m_test.cc @@ -78,7 +78,7 @@ class Y4mVideoSourceTest : public ::testing::TestWithParam<Y4mTestParam>, protected: Y4mVideoSourceTest() : Y4mVideoSource("", 0, 0) {} - virtual ~Y4mVideoSourceTest() { CloseSource(); } + ~Y4mVideoSourceTest() override { CloseSource(); } virtual void Init(const std::string &file_name, int limit) { file_name_ = file_name; @@ -140,7 +140,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { protected: Y4mVideoWriteTest() : tmpfile_(nullptr) {} - virtual ~Y4mVideoWriteTest() { + ~Y4mVideoWriteTest() override { delete tmpfile_; input_file_ = nullptr; } @@ -172,7 +172,7 @@ class Y4mVideoWriteTest : public Y4mVideoSourceTest { ReplaceInputFile(tmpfile_->file()); } - virtual void Init(const std::string &file_name, int limit) { + void Init(const std::string &file_name, int limit) override { Y4mVideoSourceTest::Init(file_name, limit); WriteY4mAndReadBack(); } diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h index 71fbf3193..e43e37d9e 100644 --- a/test/y4m_video_source.h +++ b/test/y4m_video_source.h @@ -27,7 +27,7 @@ class Y4mVideoSource : public VideoSource { start_(start), limit_(limit), frame_(0), framerate_numerator_(0), framerate_denominator_(0), y4m_() {} - virtual ~Y4mVideoSource() { + ~Y4mVideoSource() override { vpx_img_free(img_.get()); CloseSource(); } @@ -51,33 +51,33 @@ class Y4mVideoSource : public VideoSource { FillFrame(); } - virtual void Begin() { + void Begin() override { OpenSource(); ReadSourceToStart(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_.get() : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. 
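The pts() comment above is worth spelling out: the test sources use Timebase = 1/FPS, so pts equals the frame index, and presentation time in seconds is pts multiplied by the timebase. A standalone example for a 30 fps source:

    #include <cstdio>

    int main() {
      const int timebase_num = 1, timebase_den = 30;  // timebase = 1/FPS
      const long pts = 45;                            // == frame index here
      const double seconds =
          pts * static_cast<double>(timebase_num) / timebase_den;
      std::printf("frame %ld plays at %.2f s\n", pts, seconds);  // 1.50 s
      return 0;
    }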
- virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void FillFrame() { ASSERT_NE(input_file_, nullptr); diff --git a/test/yuv_temporal_filter_test.cc b/test/yuv_temporal_filter_test.cc index 2bdcf4d86..0677d5568 100644 --- a/test/yuv_temporal_filter_test.cc +++ b/test/yuv_temporal_filter_test.cc @@ -290,7 +290,7 @@ void ApplyReferenceFilter( class YUVTemporalFilterTest : public ::testing::TestWithParam<TemporalFilterWithBd> { public: - virtual void SetUp() { + void SetUp() override { filter_func_ = GetParam().temporal_filter; bd_ = GetParam().bd; use_highbd_ = (bd_ != 8); @@ -694,6 +694,18 @@ INSTANTIATE_TEST_SUITE_P( TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_sse4_1_12, 12))); #endif // HAVE_SSE4_1 +#if HAVE_NEON +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 10) +WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_neon, 12) + +INSTANTIATE_TEST_SUITE_P( + NEON, YUVTemporalFilterTest, + ::testing::Values( + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_10, + 10), + TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_neon_12, + 12))); +#endif // HAVE_NEON #else INSTANTIATE_TEST_SUITE_P( C, YUVTemporalFilterTest, @@ -704,5 +716,11 @@ INSTANTIATE_TEST_SUITE_P(SSE4_1, YUVTemporalFilterTest, ::testing::Values(TemporalFilterWithBd( &vp9_apply_temporal_filter_sse4_1, 8))); #endif // HAVE_SSE4_1 +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, YUVTemporalFilterTest, + ::testing::Values(TemporalFilterWithBd( + &vp9_apply_temporal_filter_neon, 8))); +#endif // HAVE_NEON #endif // CONFIG_VP9_HIGHBITDEPTH + } // namespace diff --git a/test/yuv_video_source.h b/test/yuv_video_source.h index 51948c0ef..bb5eec5bb 100644 --- a/test/yuv_video_source.h +++ b/test/yuv_video_source.h @@ -35,12 +35,12 @@ class YUVVideoSource : public VideoSource { SetSize(width, height, format); } - virtual ~YUVVideoSource() { + ~YUVVideoSource() override { vpx_img_free(img_); if (input_file_) fclose(input_file_); } - virtual void Begin() { + void Begin() override { if (input_file_) fclose(input_file_); input_file_ = OpenTestDataFile(file_name_); ASSERT_NE(input_file_, nullptr) @@ -53,28 +53,28 @@ class YUVVideoSource : public VideoSource { FillFrame(); } - virtual void Next() { + void Next() override { ++frame_; FillFrame(); } - virtual vpx_image_t *img() const { + vpx_image_t *img() const override { return (frame_ < limit_) ? img_ : nullptr; } // Models a stream where Timebase = 1/FPS, so pts == frame. 
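The yuv_temporal_filter hunks above follow libvpx's usual recipe for bringing a new SIMD flavor under an existing parameterized test: wrap the bit-depth variants with WRAP_HIGHBD_FUNC, then add one guarded instantiation per ISA. The same skeleton would apply to any future extension (a sketch; HAVE_FOO and vp9_apply_temporal_filter_foo are hypothetical names by analogy with the HAVE_NEON block):

    #if HAVE_FOO  // hypothetical ISA guard, analogous to HAVE_NEON above
    INSTANTIATE_TEST_SUITE_P(FOO, YUVTemporalFilterTest,
                             ::testing::Values(TemporalFilterWithBd(
                                 &vp9_apply_temporal_filter_foo, 8)));
    #endif  // HAVE_FOO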
- virtual vpx_codec_pts_t pts() const { return frame_; } + vpx_codec_pts_t pts() const override { return frame_; } - virtual unsigned long duration() const { return 1; } + unsigned long duration() const override { return 1; } - virtual vpx_rational_t timebase() const { + vpx_rational_t timebase() const override { const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; return t; } - virtual unsigned int frame() const { return frame_; } + unsigned int frame() const override { return frame_; } - virtual unsigned int limit() const { return limit_; } + unsigned int limit() const override { return limit_; } virtual void SetSize(unsigned int width, unsigned int height, vpx_img_fmt format) { diff --git a/third_party/libwebm/AUTHORS.TXT b/third_party/libwebm/AUTHORS.TXT index 9686ac13e..59b648ca6 100644 --- a/third_party/libwebm/AUTHORS.TXT +++ b/third_party/libwebm/AUTHORS.TXT @@ -2,3 +2,4 @@ # Name or Organization <email address> Google Inc. +Elijah Cirioli <eli.cirioli@gmail.com> diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk index 23f935f2d..b02795cca 100644 --- a/third_party/libwebm/Android.mk +++ b/third_party/libwebm/Android.mk @@ -1,3 +1,5 @@ +# Ignore this file during non-NDK builds. +ifdef NDK_ROOT LOCAL_PATH:= $(call my-dir) include $(CLEAR_VARS) @@ -18,3 +20,4 @@ LOCAL_LICENSE_KINDS := SPDX-license-identifier-BSD LOCAL_LICENSE_CONDITIONS := notice LOCAL_NOTICE_FILE := $(LOCAL_PATH)/LICENSE.TXT $(LOCAL_PATH)/PATENTS.TXT include $(BUILD_STATIC_LIBRARY) +endif # NDK_ROOT diff --git a/third_party/libwebm/README.libvpx b/third_party/libwebm/README.libvpx index 325604cc6..a79b982ef 100644 --- a/third_party/libwebm/README.libvpx +++ b/third_party/libwebm/README.libvpx @@ -1,7 +1,7 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: ee0bab576c338c9807249b99588e352b7268cb62 +Version: 1930e3ca23b007f3ff11d98a570077be6201957e License: BSD -License File: LICENSE.txt +License File: LICENSE.TXT Description: libwebm is used to handle WebM container I/O. @@ -18,3 +18,4 @@ Only keep: - mkvmuxer/ - mkvparser/ - PATENTS.TXT +- use -std=gnu++11 in Android.mk (https://crbug.com/webm/1708) diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/third_party/libwebm/mkvmuxer/mkvmuxer.cc index ae3653143..faaf0165f 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -607,10 +607,10 @@ bool ContentEncoding::Write(IMkvWriter* writer) const { return true; } -uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size, +uint64_t ContentEncoding::EncodingSize(uint64_t compression_size, uint64_t encryption_size) const { // TODO(fgalligan): Add support for compression settings. - if (compresion_size != 0) + if (compression_size != 0) return 0; uint64_t encoding_size = 0; diff --git a/third_party/libwebm/mkvmuxer/mkvmuxer.h b/third_party/libwebm/mkvmuxer/mkvmuxer.h index f2db37714..8602d8232 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -330,7 +330,7 @@ class ContentEncoding { private: // Returns the size in bytes for the encoding elements. - uint64_t EncodingSize(uint64_t compresion_size, + uint64_t EncodingSize(uint64_t compression_size, uint64_t encryption_size) const; // Returns the size in bytes for the encryption elements. @@ -1425,7 +1425,7 @@ class SeekHead { bool Write(IMkvWriter* writer); // We are going to put a cap on the number of Seek Entries. 
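The libwebm hunk that follows switches integral class constants from `static const` to `constexpr`. For integral types the two behave almost identically, but constexpr guarantees a compile-time constant and, from C++17 on, a constexpr static data member is implicitly inline, so no out-of-class definition is needed even when the member is odr-used. A minimal standalone illustration:

    struct Limits {
      // Compile-time constant; implicitly inline since C++17, so no
      // separate out-of-class definition is required.
      static constexpr int kSeekEntryCount = 5;
    };

    int table[Limits::kSeekEntryCount];  // usable wherever a constant is required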
- const static int32_t kSeekEntryCount = 5; + constexpr static int32_t kSeekEntryCount = 5; private: // Returns the maximum size in bytes of one seek entry. @@ -1505,8 +1505,8 @@ class Segment { kBeforeClusters = 0x1 // Position Cues before Clusters }; - static const uint32_t kDefaultDocTypeVersion = 4; - static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL; + static constexpr uint32_t kDefaultDocTypeVersion = 4; + static constexpr uint64_t kDefaultMaxClusterDuration = 30000000000ULL; Segment(); ~Segment(); diff --git a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index bd2f76913..300b15579 100644 --- a/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -607,7 +607,7 @@ uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; *minor = 3; - *build = 0; + *build = 1; *revision = 0; } diff --git a/third_party/libwebm/mkvparser/mkvparser.cc b/third_party/libwebm/mkvparser/mkvparser.cc index de8884b38..868afcb3e 100644 --- a/third_party/libwebm/mkvparser/mkvparser.cc +++ b/third_party/libwebm/mkvparser/mkvparser.cc @@ -55,7 +55,7 @@ Type* SafeArrayAlloc(unsigned long long num_elements, void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; minor = 1; - build = 0; + build = 1; revision = 0; } @@ -298,7 +298,7 @@ long UnserializeInt(IMkvReader* pReader, long long pos, long long size, if (status < 0) return status; - unsigned long long result = first_byte; + unsigned long long result = static_cast<unsigned long long>(first_byte); ++pos; for (long i = 1; i < size; ++i) { @@ -2432,7 +2432,7 @@ bool CuePoint::TrackPosition::Parse(IMkvReader* pReader, long long start_, pos += size; // consume payload } - if ((m_pos < 0) || (m_track <= 0)) { + if ((m_pos < 0) || (m_track <= 0) || (m_block < 0) || (m_block > LONG_MAX)) { return false; } diff --git a/tools_common.c b/tools_common.c index cbecfbb41..5c1378151 100644 --- a/tools_common.c +++ b/tools_common.c @@ -24,6 +24,8 @@ #include "vpx/vp8dx.h" #endif +#include "vpx/vpx_codec.h" + #if defined(_WIN32) || defined(__OS2__) #include <io.h> #include <fcntl.h> @@ -77,8 +79,8 @@ void warn(const char *fmt, ...) 
{ LOG_ERROR("Warning"); } void die_codec(vpx_codec_ctx_t *ctx, const char *s) { const char *detail = vpx_codec_error_detail(ctx); - printf("%s: %s\n", s, vpx_codec_error(ctx)); - if (detail) printf(" %s\n", detail); + fprintf(stderr, "%s: %s\n", s, vpx_codec_error(ctx)); + if (detail) fprintf(stderr, " %s\n", detail); exit(EXIT_FAILURE); } @@ -375,7 +377,7 @@ static void highbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -411,7 +413,7 @@ static void lowbd_img_upshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -452,7 +454,7 @@ void vpx_img_truncate_16_to_8(vpx_image_t *dst, vpx_image_t *src) { case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -487,7 +489,7 @@ static void highbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I42216: case VPX_IMG_FMT_I44416: case VPX_IMG_FMT_I44016: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; @@ -521,7 +523,7 @@ static void lowbd_img_downshift(vpx_image_t *dst, vpx_image_t *src, case VPX_IMG_FMT_I422: case VPX_IMG_FMT_I444: case VPX_IMG_FMT_I440: break; - default: fatal("Unsupported image conversion"); break; + default: fatal("Unsupported image conversion"); } for (plane = 0; plane < 3; plane++) { int w = src->d_w; diff --git a/tools_common.h b/tools_common.h index b9cfb9cc8..e2942d04b 100644 --- a/tools_common.h +++ b/tools_common.h @@ -32,7 +32,12 @@ typedef int64_t FileOffset; #define fseeko fseeko64 #define ftello ftello64 typedef off64_t FileOffset; -#elif CONFIG_OS_SUPPORT +#elif CONFIG_OS_SUPPORT && \ + !(defined(__ANDROID__) && __ANDROID_API__ < 24 && !defined(__LP64__) && \ + defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS == 64) +/* POSIX.1 has fseeko and ftello. fseeko and ftello are not available before + * Android API level 24. 
See + * https://android.googlesource.com/platform/bionic/+/main/docs/32-bit-abi.md */ #include <sys/types.h> /* NOLINT */ typedef off_t FileOffset; /* Use 32-bit file operations in WebM file format when building ARM @@ -145,9 +150,9 @@ VPX_NO_RETURN void usage_exit(void); int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame); typedef struct VpxInterface { - const char *const name; - const uint32_t fourcc; - vpx_codec_iface_t *(*const codec_interface)(); + const char *name; + uint32_t fourcc; + vpx_codec_iface_t *(*codec_interface)(void); } VpxInterface; int get_vpx_encoder_count(void); diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index 48e86d327..ee3c281f0 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -16,7 +16,7 @@ #include "vpx_ports/mem.h" static const int8_t vp8_sub_pel_filters[8][8] = { - { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positionyys are */ + { 0, 0, 128, 0, 0, 0, 0, 0 }, /* note that 1/8 pel positions are */ { 0, -6, 123, 12, -1, 0, 0, 0 }, /* just as per alpha -0.5 bicubic */ { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */ { 0, -9, 93, 50, -6, 0, 0, 0 }, @@ -781,7 +781,6 @@ void vp8_sixtap_predict8x4_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d8u8); dst_ptr += dst_pitch; vst1_u8(dst_ptr, d9u8); - return; } void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, @@ -1250,7 +1249,6 @@ void vp8_sixtap_predict8x8_neon(unsigned char *src_ptr, int src_pixels_per_line, vst1_u8(dst_ptr, d9u8); dst_ptr += dst_pitch; } - return; } void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, @@ -1504,7 +1502,9 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, src += src_pixels_per_line; d12u8 = vld1_u8(src); d13u8 = vld1_u8(src + 8); - d14u8 = vld1_u8(src + 16); + // Only 5 pixels are needed, avoid a potential out of bounds read. + d14u8 = vld1_u8(src + 13); + d14u8 = vext_u8(d14u8, d14u8, 3); src += src_pixels_per_line; __builtin_prefetch(src); @@ -1726,5 +1726,4 @@ void vp8_sixtap_predict16x16_neon(unsigned char *src_ptr, dst += dst_pitch; } } - return; } diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 405443449..8300aad94 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -251,7 +251,7 @@ typedef struct macroblockd { unsigned char update_mb_segmentation_data; /* 0 (do not update) 1 (update) the macroblock segmentation feature data. 
*/ - unsigned char mb_segement_abs_delta; + unsigned char mb_segment_abs_delta; /* Per frame flags that define which MB level features (such as quantizer or * loop filter level) */ diff --git a/vp8/common/loongarch/sixtap_filter_lsx.c b/vp8/common/loongarch/sixtap_filter_lsx.c index cd7ba5474..986763341 100644 --- a/vp8/common/loongarch/sixtap_filter_lsx.c +++ b/vp8/common/loongarch/sixtap_filter_lsx.c @@ -1706,21 +1706,22 @@ void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride, switch (xoffset) { case 0: { __m128i tp0; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); - src += src_stride; - tp0 = __lsx_vinsgr2vr_w(tp0, src, 0); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 1); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 2); + tp0 = __lsx_vldrepl_w(src, 0); + src += src_stride; + __lsx_vstelm_w(tp0, dst, 0, 0); dst += dst_stride; - __lsx_vstelm_w(tp0, dst, 0, 3); + tp0 = __lsx_vldrepl_w(src, 0); + __lsx_vstelm_w(tp0, dst, 0, 0); + break; } case 2: @@ -1865,7 +1866,7 @@ void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride, case 1: Predict16x16Funcs1[3](src, src_stride, dst, dst_stride, - h_filter, v_filter + 1, 16); + h_filter + 1, v_filter + 1, 16); break; } break; diff --git a/vp8/common/mips/msa/sixtap_filter_msa.c b/vp8/common/mips/msa/sixtap_filter_msa.c index b0affcff0..3a1bb7cd5 100644 --- a/vp8/common/mips/msa/sixtap_filter_msa.c +++ b/vp8/common/mips/msa/sixtap_filter_msa.c @@ -35,101 +35,134 @@ static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = { #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, \ filt_h2) \ ({ \ - v16i8 vec0_m, vec1_m, vec2_m; \ - v8i16 hz_out_m; \ + v16i8 _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m; \ + v8i16 _6tap_out_m; \ \ VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ - vec0_m, vec1_m, vec2_m); \ - hz_out_m = \ - DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2); \ + _6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m); \ + _6tap_out_m = DPADD_SH3_SH(_6tap_vec0_m, _6tap_vec1_m, _6tap_vec2_m, \ + filt_h0, filt_h1, filt_h2); \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _6tap_out_m = __msa_srari_h(_6tap_out_m, VP8_FILTER_SHIFT); \ + _6tap_out_m = __msa_sat_s_h(_6tap_out_m, 7); \ \ - hz_out_m; \ + _6tap_out_m; \ }) #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ mask2, filt0, filt1, filt2, out0, out1) \ { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ + v16i8 _6tap_4wid_vec0_m, _6tap_4wid_vec1_m, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m, _6tap_4wid_vec4_m, _6tap_4wid_vec5_m; \ \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ - DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _6tap_4wid_vec0_m, \ + _6tap_4wid_vec1_m); \ + DOTP_SB2_SH(_6tap_4wid_vec0_m, _6tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, 
mask1, _6tap_4wid_vec2_m, \ + _6tap_4wid_vec3_m); \ + DPADD_SB2_SH(_6tap_4wid_vec2_m, _6tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, _6tap_4wid_vec4_m, \ + _6tap_4wid_vec5_m); \ + DPADD_SB2_SH(_6tap_4wid_vec4_m, _6tap_4wid_vec5_m, filt2, filt2, out0, \ + out1); \ } -#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - mask2, filt0, filt1, filt2, out0, out1, \ - out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ - DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ - out0, out1, out2, out3); \ +#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + mask2, filt0, filt1, filt2, out0, out1, \ + out2, out3) \ + { \ + v16i8 _6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, _6tap_8wid_vec4_m, _6tap_8wid_vec5_m, \ + _6tap_8wid_vec6_m, _6tap_8wid_vec7_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + DOTP_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _6tap_8wid_vec0_m, \ + _6tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m); \ + VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, _6tap_8wid_vec4_m, \ + _6tap_8wid_vec5_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m); \ + DPADD_SB4_SH(_6tap_8wid_vec0_m, _6tap_8wid_vec1_m, _6tap_8wid_vec2_m, \ + _6tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ + DPADD_SB4_SH(_6tap_8wid_vec4_m, _6tap_8wid_vec5_m, _6tap_8wid_vec6_m, \ + _6tap_8wid_vec7_m, filt2, filt2, filt2, filt2, out0, out1, \ + out2, out3); \ } -#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ - ({ \ - v8i16 tmp0; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - \ - tmp0; \ - }) - -#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ +#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \ ({ \ - v16i8 vec0_m, vec1_m; \ - v8i16 hz_out_m; \ - \ - VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ - hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ + v8i16 _4tap_dpadd_tmp0; \ \ - hz_out_m = __msa_srari_h(hz_out_m, VP8_FILTER_SHIFT); \ - hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ + _4tap_dpadd_tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + _4tap_dpadd_tmp0 = \ + __msa_dpadd_s_h(_4tap_dpadd_tmp0, (v16i8)vec1, (v16i8)filt1); \ \ - hz_out_m; \ + _4tap_dpadd_tmp0; \ }) -#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, 
src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ - DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ - VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ +#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ + ({ \ + v16i8 _4tap_vec0_m, _4tap_vec1_m; \ + v8i16 _4tap_out_m; \ + \ + VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, _4tap_vec0_m, \ + _4tap_vec1_m); \ + _4tap_out_m = \ + FILT_4TAP_DPADD_S_H(_4tap_vec0_m, _4tap_vec1_m, filt_h0, filt_h1); \ + \ + _4tap_out_m = __msa_srari_h(_4tap_out_m, VP8_FILTER_SHIFT); \ + _4tap_out_m = __msa_sat_s_h(_4tap_out_m, 7); \ + \ + _4tap_out_m; \ + }) + +#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1) \ + { \ + v16i8 _4tap_4wid_vec0_m, _4tap_4wid_vec1_m, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, _4tap_4wid_vec0_m, \ + _4tap_4wid_vec1_m); \ + DOTP_SB2_SH(_4tap_4wid_vec0_m, _4tap_4wid_vec1_m, filt0, filt0, out0, \ + out1); \ + VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, _4tap_4wid_vec2_m, \ + _4tap_4wid_vec3_m); \ + DPADD_SB2_SH(_4tap_4wid_vec2_m, _4tap_4wid_vec3_m, filt1, filt1, out0, \ + out1); \ } -#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ - filt0, filt1, out0, out1, out2, out3) \ - { \ - v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ - \ - VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ - DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ - out0, out1, out2, out3); \ - VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ - VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ - DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ - out0, out1, out2, out3); \ +#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, \ + filt0, filt1, out0, out1, out2, out3) \ + { \ + v16i8 _4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m; \ + \ + VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DOTP_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt0, filt0, filt0, filt0, out0, out1, \ + out2, out3); \ + VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, _4tap_8wid_vec0_m, \ + _4tap_8wid_vec1_m); \ + VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m); \ + DPADD_SB4_SH(_4tap_8wid_vec0_m, _4tap_8wid_vec1_m, _4tap_8wid_vec2_m, \ + _4tap_8wid_vec3_m, filt1, filt1, filt1, filt1, out0, out1, \ + out2, out3); \ } static void common_hz_6t_4x4_msa(uint8_t *RESTRICT src, int32_t src_stride, diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h index 7cb3c9869..cc85b9a1f 100644 --- a/vp8/common/mips/msa/vp8_macros_msa.h +++ b/vp8/common/mips/msa/vp8_macros_msa.h @@ -40,160 +40,160 @@ #define ST_SW(...) 
ST_W(v4i32, __VA_ARGS__) #if (__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile("lw %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + ({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile("lw %[lw_val_m], %[lw_psrc_m] \n\t" \ + \ + : [lw_val_m] "=r"(lw_val_m) \ + : [lw_psrc_m] "m"(*lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile("ld %[val_m], %[psrc_m] \n\t" \ - \ - : [val_m] "=r"(val_m) \ - : [psrc_m] "m"(*psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile("ld %[ld_val_m], %[ld_psrc_m] \n\t" \ + \ + : [ld_val_m] "=r"(ld_val_m) \ + : [ld_psrc_m] "m"(*ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_ld = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_ld); \ - val1_m = LW(psrc_ld + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m); \ + ld_val1_m = LW(ld_psrc_m + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("sh %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("sh %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("sw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("sw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint64_t val_m = (val); \ - \ - asm volatile("sd %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m = (uint8_t *)(pdst); \ + const uint64_t sd_val_m = (val); \ + \ + asm volatile("sd %[sd_val_m], %[sd_pdst_m] \n\t" \ + \ + : [sd_pdst_m] "=m"(*sd_pdst_m) \ + : [sd_val_m] "r"(sd_val_m)); \ } #else // !(__mips_isa_rev >= 6) -#define LW(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint32_t val_m; \ - \ - asm volatile( \ - "lwr %[val_m], 0(%[psrc_m]) \n\t" \ - "lwl %[val_m], 3(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ - \ - val_m; \ +#define LW(psrc) \ + 
({ \ + const uint8_t *lw_psrc_m = (const uint8_t *)(psrc); \ + uint32_t lw_val_m; \ + \ + asm volatile( \ + "lwr %[lw_val_m], 0(%[lw_psrc_m]) \n\t" \ + "lwl %[lw_val_m], 3(%[lw_psrc_m]) \n\t" \ + : [lw_val_m] "=&r"(lw_val_m) \ + : [lw_psrc_m] "r"(lw_psrc_m)); \ + \ + lw_val_m; \ }) #if (__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m = (const uint8_t *)(psrc); \ - uint64_t val_m = 0; \ - \ - asm volatile( \ - "ldr %[val_m], 0(%[psrc_m]) \n\t" \ - "ldl %[val_m], 7(%[psrc_m]) \n\t" \ - : [val_m] "=&r"(val_m) \ - : [psrc_m] "r"(psrc_m)); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m = (const uint8_t *)(psrc); \ + uint64_t ld_val_m = 0; \ + \ + asm volatile( \ + "ldr %[ld_val_m], 0(%[ld_psrc_m]) \n\t" \ + "ldl %[ld_val_m], 7(%[ld_psrc_m]) \n\t" \ + : [ld_val_m] "=&r"(ld_val_m) \ + : [ld_psrc_m] "r"(ld_psrc_m)); \ + \ + ld_val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *ld_psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t ld_val0_m, ld_val1_m; \ + uint64_t ld_val_m = 0; \ + \ + ld_val0_m = LW(ld_psrc_m1); \ + ld_val1_m = LW(ld_psrc_m1 + 4); \ + \ + ld_val_m = (uint64_t)(ld_val1_m); \ + ld_val_m = (uint64_t)((ld_val_m << 32) & 0xFFFFFFFF00000000); \ + ld_val_m = (uint64_t)(ld_val_m | (uint64_t)ld_val0_m); \ + \ + ld_val_m; \ }) #endif // (__mips == 64) -#define SH(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint16_t val_m = (val); \ - \ - asm volatile("ush %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SH(val, pdst) \ + { \ + uint8_t *sh_pdst_m = (uint8_t *)(pdst); \ + const uint16_t sh_val_m = (val); \ + \ + asm volatile("ush %[sh_val_m], %[sh_pdst_m] \n\t" \ + \ + : [sh_pdst_m] "=m"(*sh_pdst_m) \ + : [sh_val_m] "r"(sh_val_m)); \ } -#define SW(val, pdst) \ - { \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ - const uint32_t val_m = (val); \ - \ - asm volatile("usw %[val_m], %[pdst_m] \n\t" \ - \ - : [pdst_m] "=m"(*pdst_m) \ - : [val_m] "r"(val_m)); \ +#define SW(val, pdst) \ + { \ + uint8_t *sw_pdst_m = (uint8_t *)(pdst); \ + const uint32_t sw_val_m = (val); \ + \ + asm volatile("usw %[sw_val_m], %[sw_pdst_m] \n\t" \ + \ + : [sw_pdst_m] "=m"(*sw_pdst_m) \ + : [sw_val_m] "r"(sw_val_m)); \ } -#define SD(val, pdst) \ - { \ - uint8_t *pdst_m1 = (uint8_t *)(pdst); \ - uint32_t val0_m, val1_m; \ - \ - val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ - val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ - \ - SW(val0_m, pdst_m1); \ - SW(val1_m, pdst_m1 + 4); \ +#define SD(val, pdst) \ + { \ + uint8_t *sd_pdst_m1 = (uint8_t *)(pdst); \ + uint32_t sd_val0_m, sd_val1_m; \ + \ + sd_val0_m = (uint32_t)((val)&0x00000000FFFFFFFF); \ + sd_val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \ + \ + SW(sd_val0_m, sd_pdst_m1); \ + SW(sd_val1_m, sd_pdst_m1 + 4); \ } #endif // (__mips_isa_rev >= 6) diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 05c72df3f..1b70ea5db 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -27,13 +27,6 @@ struct VP8_COMP; /* Create/destroy static data structures. 
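The renames running through the two MSA files above are macro-hygiene fixes rather than behavior changes: these helpers are GNU statement-expression and block macros that expand inside one another (LD is built from two LW calls; HORIZ_4TAP_FILT expands FILT_4TAP_DPADD_S_H), so a temporary named plain val_m or vec0_m in every macro can shadow an identically named variable at the call site. Prefixing each temporary with its owning macro (lw_val_m, _4tap_vec0_m, ...) makes the names unique. A minimal sketch of the hazard this avoids, using a hypothetical macro rather than one from the tree (statement expressions are a GNU extension the tree already relies on):

#include <stdio.h>

/* Risky: the macro-local val_m shadows any caller variable of the same
 * name. DOUBLE_RISKY(val_m) would expand to int val_m = (val_m) * 2;
 * whose initializer reads the freshly declared, indeterminate inner
 * val_m, not the caller's value. */
#define DOUBLE_RISKY(x)  \
  ({                     \
    int val_m = (x) * 2; \
    val_m;               \
  })

/* Safe: a unique, macro-prefixed local cannot collide. */
#define DOUBLE_SAFE(x)          \
  ({                            \
    int double_val_m = (x) * 2; \
    double_val_m;               \
  })

int main(void) {
  int val_m = 21;
  printf("%d\n", DOUBLE_SAFE(val_m)); /* prints 42 */
  return 0;
}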
*/ typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - -typedef enum { USAGE_LOCAL_FILE_PLAYBACK = 0x0, USAGE_STREAM_FROM_SERVER = 0x1, USAGE_CONSTRAINED_QUALITY = 0x2, @@ -58,19 +51,19 @@ typedef enum { #include <assert.h> static INLINE void Scale2Ratio(int mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; - case ONETWO: + case VP8E_ONETWO: *hr = 1; *hs = 2; break; @@ -90,7 +83,14 @@ typedef struct { int Width; int Height; struct vpx_rational timebase; - unsigned int target_bandwidth; /* kilobits per second */ + /* In either kilobits per second or bits per second, depending on which + * copy of oxcf this is in. + * - ctx->oxcf.target_bandwidth is in kilobits per second. See + * set_vp8e_config(). + * - ctx->cpi->oxcf.target_bandwidth is in bits per second. See + * vp8_change_config(). + */ + unsigned int target_bandwidth; /* Parameter used for applying denoiser. * For temporal denoiser: noise_sensitivity = 0 means off, @@ -221,6 +221,7 @@ typedef struct { /* Temporal scaling parameters */ unsigned int number_of_layers; + /* kilobits per second */ unsigned int target_bitrate[VPX_TS_MAX_PERIODICITY]; unsigned int rate_decimator[VPX_TS_MAX_PERIODICITY]; unsigned int periodicity; @@ -243,11 +244,11 @@ typedef struct { void vp8_initialize(); -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); -void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, const VP8_CONFIG *oxcf); int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, @@ -273,8 +274,8 @@ int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int threshold[4]); int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 739a61284..12b474d93 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -127,12 +127,6 @@ specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; # if (vpx_config("CONFIG_POSTPROC") eq "yes") { - add_proto qw/void vp8_blend_mb_inner/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - - add_proto qw/void vp8_blend_mb_outer/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - - add_proto qw/void vp8_blend_b/, "unsigned char *y, unsigned char *u, unsigned char *v, int y_1, int u_1, int v_1, int alpha, int stride"; - add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"; specialize qw/vp8_filter_by_weight16x16 sse2 msa/; diff --git a/vp8/common/vp8_loopfilter.c b/vp8/common/vp8_loopfilter.c index 9c9e5f351..4576c1853 100644 --- a/vp8/common/vp8_loopfilter.c +++
b/vp8/common/vp8_loopfilter.c @@ -111,7 +111,7 @@ void vp8_loop_filter_frame_init(VP8_COMMON *cm, MACROBLOCKD *mbd, /* Note the baseline filter values for each segment */ if (mbd->segmentation_enabled) { - if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (mbd->mb_segment_abs_delta == SEGMENT_ABSDATA) { lvl_seg = mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; } else { /* Delta Value */ lvl_seg += mbd->segment_feature_data[MB_LVL_ALT_LF][seg]; diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c index 1c1566766..af9a98c1d 100644 --- a/vp8/decoder/decodeframe.c +++ b/vp8/decoder/decodeframe.c @@ -63,7 +63,7 @@ void vp8_mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) { /* Decide whether to use the default or alternate baseline Q value. */ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id]; /* Delta Value */ @@ -829,7 +829,7 @@ static void init_frame(VP8D_COMP *pbi) { /* reset the segment feature data to 0 with delta coding (Default state). */ memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); - xd->mb_segement_abs_delta = SEGMENT_DELTADATA; + xd->mb_segment_abs_delta = SEGMENT_DELTADATA; /* reset the mode ref deltasa for loop filter */ memset(xd->ref_lf_deltas, 0, sizeof(xd->ref_lf_deltas)); @@ -995,7 +995,7 @@ int vp8_decode_frame(VP8D_COMP *pbi) { xd->update_mb_segmentation_data = (unsigned char)vp8_read_bit(bc); if (xd->update_mb_segmentation_data) { - xd->mb_segement_abs_delta = (unsigned char)vp8_read_bit(bc); + xd->mb_segment_abs_delta = (unsigned char)vp8_read_bit(bc); memset(xd->segment_feature_data, 0, sizeof(xd->segment_feature_data)); @@ -1167,14 +1167,6 @@ int vp8_decode_frame(VP8D_COMP *pbi) { if (pbi->ec_active && xd->corrupted) pc->refresh_last_frame = 1; #endif - if (0) { - FILE *z = fopen("decodestats.stt", "a"); - fprintf(z, "%6d F:%d,G:%d,A:%d,L:%d,Q:%d\n", pc->current_video_frame, - pc->frame_type, pc->refresh_golden_frame, pc->refresh_alt_ref_frame, - pc->refresh_last_frame, pc->base_qindex); - fclose(z); - } - { pbi->independent_partitions = 1; diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index a6bedc4fa..56500a850 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -135,27 +135,6 @@ int vp8_decode_frame(VP8D_COMP *pbi); int vp8_create_decoder_instances(struct frame_buffers *fb, VP8D_CONFIG *oxcf); int vp8_remove_decoder_instances(struct frame_buffers *fb); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(pbi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(pbi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #ifdef __cplusplus } // extern "C" #endif diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 490f62d1b..6ccb080cf 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -30,11 +30,13 @@ #include "error_concealment.h" #endif -#define CALLOC_ARRAY(p, n) CHECK_MEM_ERROR((p), vpx_calloc(sizeof(*(p)), (n))) -#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ - do { \ - CHECK_MEM_ERROR((p), vpx_memalign((algn), 
sizeof(*(p)) * (n))); \ - memset((p), 0, (n) * sizeof(*(p))); \ +#define CALLOC_ARRAY(p, n) \ + CHECK_MEM_ERROR(&pbi->common.error, (p), vpx_calloc(sizeof(*(p)), (n))) +#define CALLOC_ARRAY_ALIGNED(p, n, algn) \ + do { \ + CHECK_MEM_ERROR(&pbi->common.error, (p), \ + vpx_memalign((algn), sizeof(*(p)) * (n))); \ + memset((p), 0, (n) * sizeof(*(p))); \ } while (0) static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, @@ -54,7 +56,7 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->dst = xd->dst; mbd->segmentation_enabled = xd->segmentation_enabled; - mbd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + mbd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); @@ -754,7 +756,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; /* Allocate a vpx_atomic_int for each mb row. */ - CHECK_MEM_ERROR(pbi->mt_current_mb_col, + CHECK_MEM_ERROR(&pc->error, pbi->mt_current_mb_col, vpx_malloc(sizeof(*pbi->mt_current_mb_col) * pc->mb_rows)); for (i = 0; i < pc->mb_rows; ++i) vpx_atomic_init(&pbi->mt_current_mb_col[i], 0); @@ -762,7 +764,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for above_row buffers. */ CALLOC_ARRAY(pbi->mt_yabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_yabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (width + (VP8BORDERINPIXELS << 1)))); vp8_zero_array(pbi->mt_yabove_row[i], width + (VP8BORDERINPIXELS << 1)); @@ -770,7 +772,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { CALLOC_ARRAY(pbi->mt_uabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_uabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); vp8_zero_array(pbi->mt_uabove_row[i], uv_width + VP8BORDERINPIXELS); @@ -778,7 +780,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { CALLOC_ARRAY(pbi->mt_vabove_row, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) { - CHECK_MEM_ERROR(pbi->mt_vabove_row[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vabove_row[i], vpx_memalign(16, sizeof(unsigned char) * (uv_width + VP8BORDERINPIXELS))); vp8_zero_array(pbi->mt_vabove_row[i], uv_width + VP8BORDERINPIXELS); @@ -787,17 +789,17 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { /* Allocate memory for left_col buffers. 
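The CHECK_MEM_ERROR rewrites here follow from the macro itself changing shape: the per-module copies (deleted from vp8/decoder/onyxd_int.h above, and from vp8/encoder/onyx_int.h later in this diff) hard-coded pbi->common.error or cpi->common.error, so the decoder and encoder each carried a near-identical private macro. Call sites now pass the error context explicitly, which is what allows one shared definition in vpx/internal/vpx_codec_internal.h (the header newly included by encodeframe.c below). A sketch of the shared release-mode shape, mirroring the deleted variants:

#define CHECK_MEM_ERROR(error, lval, expr)                   \
  do {                                                       \
    assert((error)->setjmp);                                 \
    (lval) = (expr);                                         \
    if (!(lval))                                             \
      vpx_internal_error((error), VPX_CODEC_MEM_ERROR,       \
                         "Failed to allocate " #lval);       \
  } while (0)

As in the deleted copies, a CONFIG_DEBUG variant additionally appends " at %s:%d" with __FILE__ and __LINE__.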
*/ CALLOC_ARRAY(pbi->mt_yleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_yleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_yleft_col[i], vpx_calloc(sizeof(unsigned char) * 16, 1)); CALLOC_ARRAY(pbi->mt_uleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_uleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_uleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); CALLOC_ARRAY(pbi->mt_vleft_col, pc->mb_rows); for (i = 0; i < pc->mb_rows; ++i) - CHECK_MEM_ERROR(pbi->mt_vleft_col[i], + CHECK_MEM_ERROR(&pc->error, pbi->mt_vleft_col[i], vpx_calloc(sizeof(unsigned char) * 8, 1)); } } diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c index 6fc60805f..950c94334 100644 --- a/vp8/encoder/arm/neon/fastquantizeb_neon.c +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -28,11 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { zig_zag1 = vld1q_u16(inv_zig_zag + 8); int16x8_t x0, x1, sz0, sz1, y0, y1; uint16x8_t eob0, eob1; -#ifndef __aarch64__ +#if !VPX_ARCH_AARCH64 uint16x4_t eob_d16; uint32x2_t eob_d32; uint32x4_t eob_q32; -#endif // __arch64__ +#endif // !VPX_ARCH_AARCH64 /* sign of z: z >> 15 */ sz0 = vshrq_n_s16(z0, 15); @@ -70,7 +70,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { /* select the largest value */ eob0 = vmaxq_u16(eob0, eob1); -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *d->eob = (int8_t)vmaxvq_u16(eob0); #else eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0)); @@ -79,7 +79,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { eob_d32 = vpmax_u32(eob_d32, eob_d32); vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 /* qcoeff = x */ vst1q_s16(d->qcoeff, x0); diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c index 190b013af..03691fc9d 100644 --- a/vp8/encoder/bitstream.c +++ b/vp8/encoder/bitstream.c @@ -1080,7 +1080,7 @@ void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, if (xd->update_mb_segmentation_data) { signed char Data; - vp8_write_bit(bc, xd->mb_segement_abs_delta); + vp8_write_bit(bc, xd->mb_segment_abs_delta); /* For each segmentation feature (Quant and loop filter level) */ for (i = 0; i < MB_LVL_MAX; ++i) { diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index e54d1e9f4..a666bca4d 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -135,7 +135,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, // When adopting aggressive denoiser, the adj_val for each pixel // could be at most 8 (this is current max adjustment of the map). // In SSE code, we calculate the sum of adj_val for - // the columns, so the sum could be upto 128(16 rows). However, + // the columns, so the sum could be up to 128(16 rows). However, // the range of the value is -128 ~ 127 in SSE code, that's why // we do this change in C code. // We don't do this for UV denoiser, since there are only 8 rows, diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 620107500..5c973940e 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -7,6 +7,8 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
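In the fastquantizeb_neon.c hunks above, the condition moves from the compiler-defined __aarch64__ to libvpx's own VPX_ARCH_AARCH64 config macro. The two code paths exist because only AArch64 NEON has an across-vector maximum (vmaxvq_u16); 32-bit NEON must reduce with pairwise maxima. A self-contained sketch of the same reduction (the tree's version finishes with a u32 reinterpret and a lane store rather than returning a value; VPX_ARCH_AARCH64 comes from libvpx's generated config):

#include <arm_neon.h>

static int max_lane_u16(uint16x8_t v) {
#if VPX_ARCH_AARCH64
  return (int)vmaxvq_u16(v); /* single across-vector max on AArch64 */
#else
  uint16x4_t m = vmax_u16(vget_low_u16(v), vget_high_u16(v)); /* 8 -> 4 lanes */
  m = vpmax_u16(m, m); /* pairwise max: 4 -> 2 meaningful lanes */
  m = vpmax_u16(m, m); /* 2 -> 1 */
  return (int)vget_lane_u16(m, 0);
#endif
}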
*/ +#include <stdio.h> +#include <limits.h> #include "vpx_config.h" #include "vp8_rtcd.h" @@ -29,9 +31,9 @@ #include "rdopt.h" #include "pickinter.h" #include "vp8/common/findnearmv.h" -#include <stdio.h> -#include <limits.h> #include "vp8/common/invtrans.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/vpx_timer.h" #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING #include "bitstream.h" @@ -123,7 +125,7 @@ static void calc_av_activity(VP8_COMP *cpi, int64_t activity_sum) { unsigned int tmp; /* Create a list to sort to */ - CHECK_MEM_ERROR(sortlist, + CHECK_MEM_ERROR(&cpi->common.error, sortlist, vpx_calloc(sizeof(unsigned int), cpi->common.MBs)); /* Copy map to sort list */ @@ -750,6 +752,15 @@ void vp8_encode_frame(VP8_COMP *cpi) { vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, cpi->encoding_thread_count); + if (cpi->mt_current_mb_col_size != cm->mb_rows) { + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mt_current_mb_col, + vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); + cpi->mt_current_mb_col_size = cm->mb_rows; + } for (i = 0; i < cm->mb_rows; ++i) vpx_atomic_store_release(&cpi->mt_current_mb_col[i], -1); diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index cb35f4f49..e2f8b89d4 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. */ +#include <stddef.h> #include "onyx_int.h" #include "vp8/common/threading.h" @@ -402,7 +403,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) { zd->subpixel_predict8x8 = xd->subpixel_predict8x8; zd->subpixel_predict16x16 = xd->subpixel_predict16x16; zd->segmentation_enabled = xd->segmentation_enabled; - zd->mb_segement_abs_delta = xd->mb_segement_abs_delta; + zd->mb_segment_abs_delta = xd->mb_segment_abs_delta; memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data)); @@ -487,15 +488,10 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, int vp8cx_create_encoder_threads(VP8_COMP *cpi) { const VP8_COMMON *cm = &cpi->common; - - vpx_atomic_init(&cpi->b_multi_threaded, 0); - cpi->encoding_thread_count = 0; - cpi->b_lpf_running = 0; + int th_count = 0; if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { - int ithread; - int th_count = cpi->oxcf.multi_threaded - 1; - int rc = 0; + th_count = cpi->oxcf.multi_threaded - 1; /* don't allocate more threads than cores available */ if (cpi->oxcf.multi_threaded > cm->processor_core_count) { @@ -507,19 +503,24 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1)) { th_count = (cm->mb_cols / cpi->mt_sync_range) - 1; } + } + if (th_count == cpi->encoding_thread_count) return 0; - if (th_count == 0) return 0; + vp8cx_remove_encoder_threads(cpi); + if (th_count != 0) { + int ithread; + int rc = 0; - CHECK_MEM_ERROR(cpi->h_encoding_thread, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_start_encoding, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count)); - CHECK_MEM_ERROR(cpi->h_event_end_encoding, + CHECK_MEM_ERROR(&cpi->common.error, cpi->h_event_end_encoding, vpx_malloc(sizeof(sem_t) * th_count)); - 
CHECK_MEM_ERROR(cpi->mb_row_ei, + CHECK_MEM_ERROR(&cpi->common.error, cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count)); memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count); - CHECK_MEM_ERROR(cpi->en_thread_data, + CHECK_MEM_ERROR(&cpi->common.error, cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count)); vpx_atomic_store_release(&cpi->b_multi_threaded, 1); @@ -553,6 +554,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* shutdown other threads */ vpx_atomic_store_release(&cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { + sem_post(&cpi->h_event_start_encoding[ithread]); + sem_post(&cpi->h_event_end_encoding[ithread]); pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); sem_destroy(&cpi->h_event_end_encoding[ithread]); @@ -560,10 +563,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -1; } @@ -592,10 +601,16 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { /* free thread related resources */ vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; return -2; } @@ -627,13 +642,23 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { sem_destroy(&cpi->h_event_end_lpf); sem_destroy(&cpi->h_event_start_lpf); + cpi->b_lpf_running = 0; /* free thread related resources */ + vpx_free(cpi->mt_current_mb_col); + cpi->mt_current_mb_col = NULL; + cpi->mt_current_mb_col_size = 0; vpx_free(cpi->h_event_start_encoding); + cpi->h_event_start_encoding = NULL; vpx_free(cpi->h_event_end_encoding); + cpi->h_event_end_encoding = NULL; vpx_free(cpi->h_encoding_thread); + cpi->h_encoding_thread = NULL; vpx_free(cpi->mb_row_ei); + cpi->mb_row_ei = NULL; vpx_free(cpi->en_thread_data); + cpi->en_thread_data = NULL; + cpi->encoding_thread_count = 0; } } #endif diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 65d2681c9..4443f5e7c 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -412,7 +412,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, int_mv ref_mv_full; int tmp_err; - int step_param = 3; /* Dont search over full range for first pass */ + int step_param = 3; /* Don't search over full range for first pass */ int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; int n; vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; @@ -821,22 +821,6 @@ void vp8_first_pass(VP8_COMP *cpi) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); } - /* use this to see what the first pass reconstruction looks like */ - if (0) { - char filename[512]; - FILE *recon_file; - sprintf(filename, "enc%04d.yuv", (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) { - recon_file = fopen(filename, "wb"); - } else { - recon_file = fopen(filename, "ab"); - } - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - 
fclose(recon_file); - } - cm->current_video_frame++; } extern const int vp8_bits_per_mb[2][QINDEX_RANGE]; @@ -1038,12 +1022,6 @@ static int estimate_cq(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats, double clip_iifactor; int overhead_bits_per_mb; - if (0) { - FILE *f = fopen("epmp.stt", "a"); - fprintf(f, "%10.2f\n", err_per_mb); - fclose(f); - } - target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs); @@ -1230,17 +1208,6 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, Q++; } - if (0) { - FILE *f = fopen("estkf_q.stt", "a"); - fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", - cpi->common.current_video_frame, bits_per_mb_at_this_q, - target_norm_bits_per_mb, err_per_mb, err_correction_factor, - current_spend_ratio, group_iiratio, iiratio_correction_factor, - (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, - Q); - fclose(f); - } - return Q; } @@ -1537,7 +1504,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. */ flash_detected = @@ -1581,7 +1548,7 @@ static int calc_arf_boost(VP8_COMP *cpi, int offset, int f_frames, int b_frames, /* Calculate the baseline boost number for this frame */ r = calc_frame_boost(cpi, &this_frame, this_frame_mv_in_out); - /* We want to discount the the flash frame itself and the recovery + /* We want to discount the flash frame itself and the recovery * frame that follows as both will have poor scores. 
*/ flash_detected = @@ -1717,9 +1684,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { /* Break at cpi->max_gf_interval unless almost totally static */ (i >= cpi->max_gf_interval && (decay_accumulator < 0.995)) || ( - /* Dont break out with a very short interval */ + /* Don't break out with a very short interval */ (i > MIN_GF_INTERVAL) && - /* Dont break out very close to a key frame */ + /* Don't break out very close to a key frame */ ((cpi->twopass.frames_to_key - i) >= MIN_GF_INTERVAL) && ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) && (!flash_detected) && @@ -1765,7 +1732,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (boost_score > max_boost) boost_score = max_boost; } - /* Dont allow conventional gf too near the next kf */ + /* Don't allow conventional gf too near the next kf */ if ((cpi->twopass.frames_to_key - i) < MIN_GF_INTERVAL) { while (i < cpi->twopass.frames_to_key) { i++; @@ -1786,9 +1753,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { alt_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost, &b_boost); #endif - /* Should we use the alternate refernce frame */ + /* Should we use the alternate reference frame */ if (allow_alt_ref && (i >= MIN_GF_INTERVAL) && - /* dont use ARF very near next kf */ + /* don't use ARF very near next kf */ (i <= (cpi->twopass.frames_to_key - MIN_GF_INTERVAL)) && #if NEW_BOOST ((next_frame.pcnt_inter > 0.75) || (next_frame.pcnt_second_ref > 0.5)) && @@ -2082,7 +2049,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } } - /* Dont allow a negative value for gf_bits */ + /* Don't allow a negative value for gf_bits */ if (gf_bits < 0) gf_bits = 0; /* Add in minimum for a frame */ @@ -2123,7 +2090,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { if (cpi->twopass.gf_group_bits < 0) cpi->twopass.gf_group_bits = 0; /* This condition could fail if there are two kfs very close together - * despite (MIN_GF_INTERVAL) and would cause a devide by 0 in the + * despite (MIN_GF_INTERVAL) and would cause a divide by 0 in the * calculation of cpi->twopass.alt_extra_bits. */ if (cpi->baseline_gf_interval >= 3) { @@ -2393,7 +2360,7 @@ void vp8_second_pass(VP8_COMP *cpi) { } /* The last few frames of a clip almost always have to few or too many - * bits and for the sake of over exact rate control we dont want to make + * bits and for the sake of over exact rate control we don't want to make * radical adjustments to the allowed quantizer range just to use up a * few surplus bits or get beneath the target rate. */ @@ -2990,8 +2957,8 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { } /* Set back to unscaled by defaults */ - cpi->common.horiz_scale = NORMAL; - cpi->common.vert_scale = NORMAL; + cpi->common.horiz_scale = VP8E_NORMAL; + cpi->common.vert_scale = VP8E_NORMAL; /* Calculate Average bits per frame. 
*/ av_bits_per_frame = cpi->oxcf.target_bandwidth / @@ -3011,7 +2978,7 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { bits_per_frame = (double)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key); - /* Dont turn to resampling in easy sections just because they + /* Don't turn to resampling in easy sections just because they * have been assigned a small number of bits */ if (bits_per_frame < av_bits_per_frame) { @@ -3047,16 +3014,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { (int)((projected_bits_perframe - av_bits_per_frame) * cpi->twopass.frames_to_key)); - if (0) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n", - cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, - cpi->common.vert_scale, kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } - /* The trigger for spatial resampling depends on the various * parameters such as whether we are streaming (CBR) or VBR. */ @@ -3120,17 +3077,6 @@ static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) { */ kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, (int)bits_per_frame, group_iiratio); - - if (0) { - FILE *f = fopen("Subsamle.stt", "a"); - fprintf( - f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n", kf_q, - cpi->common.horiz_scale, cpi->common.vert_scale, - kf_group_err / cpi->twopass.frames_to_key, - (int)(cpi->twopass.kf_group_bits / cpi->twopass.frames_to_key), - new_height, new_width); - fclose(f); - } } } diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index b92e2135e..bc150e482 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -1123,8 +1123,8 @@ int vp8_diamond_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, } } - this_mv.as_mv.row = best_mv->as_mv.row << 3; - this_mv.as_mv.col = best_mv->as_mv.col << 3; + this_mv.as_mv.row = best_mv->as_mv.row * 8; + this_mv.as_mv.col = best_mv->as_mv.col * 8; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); @@ -1441,8 +1441,8 @@ int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, } } - this_mv.as_mv.row = ref_mv->as_mv.row << 3; - this_mv.as_mv.col = ref_mv->as_mv.col << 3; + this_mv.as_mv.row = ref_mv->as_mv.row * 8; + this_mv.as_mv.col = ref_mv->as_mv.col * 8; return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) + mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit); diff --git a/vp8/encoder/mr_dissim.c b/vp8/encoder/mr_dissim.c index 011b62a08..b1bfb4b54 100644 --- a/vp8/encoder/mr_dissim.c +++ b/vp8/encoder/mr_dissim.c @@ -49,7 +49,6 @@ void vp8_cal_low_res_mb_cols(VP8_COMP *cpi) { void vp8_cal_dissimilarity(VP8_COMP *cpi) { VP8_COMMON *cm = &cpi->common; - int i; /* Note: The first row & first column in mip are outside the frame, which * were initialized to all 0.(ref_frame, mode, mv...) 
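The two mcomp.c hunks above replace left shifts with multiplies when converting whole-pel motion vectors to 1/8-pel units: MV components are signed and may be negative, and left-shifting a negative value is undefined behavior in C, while the equivalent multiply is well-defined and produces the same code. The pattern in isolation:

/* was: this_mv.as_mv.row = best_mv->as_mv.row << 3;  (UB for negative rows) */
static int whole_to_eighth_pel(int mv_component) {
  return mv_component * 8; /* defined for negative values; identical codegen */
}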
@@ -67,6 +66,7 @@ void vp8_cal_dissimilarity(VP8_COMP *cpi) { store_info->frame_type = cm->frame_type; if (cm->frame_type != KEY_FRAME) { + int i; store_info->is_frame_dropped = 0; for (i = 1; i < MAX_REF_FRAMES; ++i) store_info->low_res_ref_frames[i] = cpi->current_ref_frames[i]; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 148a16cc4..4e128e3c4 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -270,7 +270,7 @@ static int rescale(int val, int num, int denom) { return (int)(llval * llnum / llden); } -void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate) { LAYER_CONTEXT *lc = &cpi->layer_context[layer]; @@ -328,7 +328,7 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, // for any "new" layers. For "existing" layers, let them inherit the parameters // from the previous layer state (at the same layer #). In future we may want // to better map the previous layer state(s) to the "new" ones. -void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int prev_num_layers) { int i; double prev_layer_framerate = 0; @@ -442,11 +442,6 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { vpx_free(cpi->mb.pip); cpi->mb.pip = 0; - -#if CONFIG_MULTITHREAD - vpx_free(cpi->mt_current_mb_col); - cpi->mt_current_mb_col = NULL; -#endif } static void enable_segmentation(VP8_COMP *cpi) { @@ -488,7 +483,7 @@ static void set_segmentation_map(VP8_COMP *cpi, */ static void set_segment_data(VP8_COMP *cpi, signed char *feature_data, unsigned char abs_delta) { - cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta; + cpi->mb.e_mbd.mb_segment_abs_delta = abs_delta; memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data)); } @@ -1169,7 +1164,8 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { #else unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16; #endif - CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok))); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tok, + vpx_calloc(tokens, sizeof(*cpi->tok))); } /* Data used for real time vc mode to see if gf needs refreshing */ @@ -1178,37 +1174,39 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { /* Structures used to monitor GF usage */ vpx_free(cpi->gf_active_flags); CHECK_MEM_ERROR( - cpi->gf_active_flags, + &cpi->common.error, cpi->gf_active_flags, vpx_calloc(sizeof(*cpi->gf_active_flags), cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; vpx_free(cpi->mb_activity_map); CHECK_MEM_ERROR( - cpi->mb_activity_map, + &cpi->common.error, cpi->mb_activity_map, vpx_calloc(sizeof(*cpi->mb_activity_map), cm->mb_rows * cm->mb_cols)); /* allocate memory for storing last frame's MVs for MV prediction. 
*/ vpx_free(cpi->lfmv); - CHECK_MEM_ERROR(cpi->lfmv, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), - sizeof(*cpi->lfmv))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->lfmv, + vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lfmv))); vpx_free(cpi->lf_ref_frame_sign_bias); - CHECK_MEM_ERROR(cpi->lf_ref_frame_sign_bias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame_sign_bias, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame_sign_bias))); vpx_free(cpi->lf_ref_frame); - CHECK_MEM_ERROR(cpi->lf_ref_frame, + CHECK_MEM_ERROR(&cpi->common.error, cpi->lf_ref_frame, vpx_calloc((cm->mb_rows + 2) * (cm->mb_cols + 2), sizeof(*cpi->lf_ref_frame))); /* Create the encoder segmentation map and set all entries to 0 */ vpx_free(cpi->segmentation_map); CHECK_MEM_ERROR( - cpi->segmentation_map, + &cpi->common.error, cpi->segmentation_map, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->segmentation_map))); cpi->cyclic_refresh_mode_index = 0; vpx_free(cpi->active_map); - CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(*cpi->active_map))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->active_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->active_map))); memset(cpi->active_map, 1, (cm->mb_rows * cm->mb_cols)); #if CONFIG_MULTITHREAD @@ -1221,21 +1219,11 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } else { cpi->mt_sync_range = 16; } - - if (cpi->oxcf.multi_threaded > 1) { - int i; - - vpx_free(cpi->mt_current_mb_col); - CHECK_MEM_ERROR(cpi->mt_current_mb_col, - vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); - for (i = 0; i < cm->mb_rows; ++i) - vpx_atomic_init(&cpi->mt_current_mb_col[i], 0); - } - #endif vpx_free(cpi->tplist); - CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); + CHECK_MEM_ERROR(&cpi->common.error, cpi->tplist, + vpx_malloc(sizeof(TOKENLIST) * cm->mb_rows)); #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity > 0) { @@ -1302,7 +1290,7 @@ void vp8_new_framerate(VP8_COMP *cpi, double framerate) { } } -static void init_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +static void init_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; cpi->oxcf = *oxcf; @@ -1424,11 +1412,11 @@ void vp8_update_layer_contexts(VP8_COMP *cpi) { } } -void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { +void vp8_change_config(VP8_COMP *cpi, const VP8_CONFIG *oxcf) { VP8_COMMON *cm = &cpi->common; int last_w, last_h; unsigned int prev_number_of_layers; - unsigned int raw_target_rate; + double raw_target_rate; if (!cpi) return; @@ -1443,11 +1431,6 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { last_h = cpi->oxcf.Height; prev_number_of_layers = cpi->oxcf.number_of_layers; - if (cpi->initial_width) { - // TODO(https://crbug.com/1486441): Allow changing thread counts; the - // allocation is done once in vp8_create_compressor(). 
- oxcf->multi_threaded = cpi->oxcf.multi_threaded; - } cpi->oxcf = *oxcf; switch (cpi->oxcf.Mode) { @@ -1574,10 +1557,10 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->oxcf.maximum_buffer_size_in_ms = 240000; } - raw_target_rate = (unsigned int)((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * - 8 * 3 * cpi->framerate / 1000); + raw_target_rate = ((int64_t)cpi->oxcf.Width * cpi->oxcf.Height * 8 * 3 * + cpi->framerate / 1000.0); if (cpi->oxcf.target_bandwidth > raw_target_rate) - cpi->oxcf.target_bandwidth = raw_target_rate; + cpi->oxcf.target_bandwidth = (unsigned int)raw_target_rate; /* Convert target bandwidth from Kbit/s to Bit/s */ cpi->oxcf.target_bandwidth *= 1000; @@ -1672,7 +1655,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cm->sharpness_level = cpi->oxcf.Sharpness; - if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL) { + if (cm->horiz_scale != VP8E_NORMAL || cm->vert_scale != VP8E_NORMAL) { int hr, hs, vr, vs; Scale2Ratio(cm->horiz_scale, &hr, &hs); @@ -1756,7 +1739,7 @@ static void cal_mvsadcosts(int *mvsadcost[2]) { } while (++i <= mvfp_max); } -struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { +struct VP8_COMP *vp8_create_compressor(const VP8_CONFIG *oxcf) { int i; VP8_COMP *cpi; @@ -1778,8 +1761,9 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->common.error.setjmp = 1; - CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), - (MAX_MVSEARCH_STEPS * 8) + 1)); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->mb.ss, + vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1)); vp8_create_common(&cpi->common); @@ -1884,18 +1868,19 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { } if (cpi->cyclic_refresh_mode_enabled) { - CHECK_MEM_ERROR(cpi->cyclic_refresh_map, + CHECK_MEM_ERROR(&cpi->common.error, cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); } else { cpi->cyclic_refresh_map = (signed char *)NULL; } - CHECK_MEM_ERROR(cpi->skin_map, vpx_calloc(cm->mb_rows * cm->mb_cols, - sizeof(cpi->skin_map[0]))); + CHECK_MEM_ERROR( + &cpi->common.error, cpi->skin_map, + vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(cpi->skin_map[0]))); - CHECK_MEM_ERROR(cpi->consec_zero_last, + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last, vpx_calloc(cm->mb_rows * cm->mb_cols, 1)); - CHECK_MEM_ERROR(cpi->consec_zero_last_mvbias, + CHECK_MEM_ERROR(&cpi->common.error, cpi->consec_zero_last_mvbias, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); /*Initialize the feed-forward activity masking.*/ @@ -1914,6 +1899,7 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->force_maxqp = 0; cpi->frames_since_last_drop_overshoot = 0; cpi->rt_always_update_correction_factor = 0; + cpi->rt_drop_recode_on_overshoot = 1; cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; #if CONFIG_INTERNAL_STATS @@ -2114,7 +2100,6 @@ void vp8_remove_compressor(VP8_COMP **comp) { double time_encoded = (cpi->last_end_time_stamp_seen - cpi->first_time_stamp_ever) / 10000000.000; - double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; if (cpi->b_calculate_psnr) { if (cpi->oxcf.number_of_layers > 1) { @@ -2143,6 +2128,7 @@ void vp8_remove_compressor(VP8_COMP **comp) { total_psnr2, total_ssim); } } else { + double dr = (double)cpi->bytes * 8.0 / 1000.0 / time_encoded; double samples = 3.0 / 2 * cpi->count * cpi->common.Width * cpi->common.Height; double total_psnr = @@ -2250,7 +2236,7 @@ void vp8_remove_compressor(VP8_COMP **comp) { #if 0 { 
printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000); } #endif @@ -2509,15 +2495,17 @@ static int resize_key_frame(VP8_COMP *cpi) { if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO; - cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO; + (cm->horiz_scale < VP8E_ONETWO) ? cm->horiz_scale + 1 : VP8E_ONETWO; + cm->vert_scale = + (cm->vert_scale < VP8E_ONETWO) ? cm->vert_scale + 1 : VP8E_ONETWO; } /* Should we now start scaling back up */ else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100)) { cm->horiz_scale = - (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL; - cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL; + (cm->horiz_scale > VP8E_NORMAL) ? cm->horiz_scale - 1 : VP8E_NORMAL; + cm->vert_scale = + (cm->vert_scale > VP8E_NORMAL) ? cm->vert_scale - 1 : VP8E_NORMAL; } /* Get the new height and width */ @@ -2748,7 +2736,7 @@ static int decide_key_frame(VP8_COMP *cpi) { } /* in addition if the following are true and this is not a golden frame * then code a key frame Note that on golden frames there often seems - * to be a pop in intra useage anyway hence this restriction is + * to be a pop in intra usage anyway hence this restriction is * designed to prevent spurious key frames. The Intra pop needs to be * investigated. */ @@ -3196,6 +3184,113 @@ void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm) { vp8_yv12_extend_frame_borders(cm->frame_to_show); } +// Return 1 if frame is to be dropped. Update frame drop decimation +// counters. +int vp8_check_drop_buffer(VP8_COMP *cpi) { + VP8_COMMON *cm = &cpi->common; + int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * + cpi->oxcf.optimal_buffer_level / 100); + int drop_mark75 = drop_mark * 2 / 3; + int drop_mark50 = drop_mark / 4; + int drop_mark25 = drop_mark / 8; + if (cpi->drop_frames_allowed) { + /* The reset to decimation 0 is only done here for one pass. + * Once it is set two pass leaves decimation on till the next kf. + */ + if (cpi->buffer_level > drop_mark && cpi->decimation_factor > 0) { + cpi->decimation_factor--; + } + + if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { + cpi->decimation_factor = 1; + + } else if (cpi->buffer_level < drop_mark25 && + (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { + cpi->decimation_factor = 3; + } else if (cpi->buffer_level < drop_mark50 && + (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { + cpi->decimation_factor = 2; + } else if (cpi->buffer_level < drop_mark75 && + (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { + cpi->decimation_factor = 1; + } + } + + /* The following decimates the frame rate according to a regular + * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help + * prevent buffer under-run in CBR mode. Alternatively it might be + * desirable in some situations to drop frame rate but throw more bits + * at each frame. 
+ * + * Note that dropping a key frame can be problematic if spatial + * resampling is also active + */ + if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { + switch (cpi->decimation_factor) { + case 1: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; + break; + case 2: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + case 3: + cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; + break; + } + + /* Note that we should not throw out a key frame (especially when + * spatial resampling is enabled). + */ + if (cm->frame_type == KEY_FRAME) { + cpi->decimation_count = cpi->decimation_factor; + } else if (cpi->decimation_count > 0) { + cpi->decimation_count--; + + cpi->bits_off_target += cpi->av_per_frame_bandwidth; + if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { + cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; + } + +#if CONFIG_MULTI_RES_ENCODING + vp8_store_drop_frame_info(cpi); +#endif + + cm->current_video_frame++; + cpi->frames_since_key++; + cpi->ext_refresh_frame_flags_pending = 0; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + +#if CONFIG_INTERNAL_STATS + cpi->count++; +#endif + + cpi->buffer_level = cpi->bits_off_target; + + if (cpi->oxcf.number_of_layers > 1) { + unsigned int i; + + /* Propagate bits saved by dropping the frame to higher + * layers + */ + for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { + LAYER_CONTEXT *lc = &cpi->layer_context[i]; + lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); + if (lc->bits_off_target > lc->maximum_buffer_size) { + lc->bits_off_target = lc->maximum_buffer_size; + } + lc->buffer_level = lc->bits_off_target; + } + } + return 1; + } else { + cpi->decimation_count = cpi->decimation_factor; + } + } else { + cpi->decimation_count = 0; + } + return 0; +} static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, unsigned char *dest, @@ -3206,7 +3301,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int frame_under_shoot_limit; int Loop = 0; - int loop_count; VP8_COMMON *cm = &cpi->common; int active_worst_qchanged = 0; @@ -3222,12 +3316,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, int undershoot_seen = 0; #endif - int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * - cpi->oxcf.optimal_buffer_level / 100); - int drop_mark75 = drop_mark * 2 / 3; - int drop_mark50 = drop_mark / 4; - int drop_mark25 = drop_mark / 8; - /* Clear down mmx registers to allow floating point in what follows */ vpx_clear_system_state(); @@ -3441,102 +3529,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, update_rd_ref_frame_probs(cpi); - if (cpi->drop_frames_allowed) { - /* The reset to decimation 0 is only done here for one pass. - * Once it is set two pass leaves decimation on till the next kf. 
- */ - if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0)) { - cpi->decimation_factor--; - } - - if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0) { - cpi->decimation_factor = 1; - - } else if (cpi->buffer_level < drop_mark25 && - (cpi->decimation_factor == 2 || cpi->decimation_factor == 3)) { - cpi->decimation_factor = 3; - } else if (cpi->buffer_level < drop_mark50 && - (cpi->decimation_factor == 1 || cpi->decimation_factor == 2)) { - cpi->decimation_factor = 2; - } else if (cpi->buffer_level < drop_mark75 && - (cpi->decimation_factor == 0 || cpi->decimation_factor == 1)) { - cpi->decimation_factor = 1; - } - } - - /* The following decimates the frame rate according to a regular - * pattern (i.e. to 1/2 or 2/3 frame rate) This can be used to help - * prevent buffer under-run in CBR mode. Alternatively it might be - * desirable in some situations to drop frame rate but throw more bits - * at each frame. - * - * Note that dropping a key frame can be problematic if spatial - * resampling is also active - */ - if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) { - switch (cpi->decimation_factor) { - case 1: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2; - break; - case 2: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - case 3: - cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 5 / 4; - break; - } - - /* Note that we should not throw out a key frame (especially when - * spatial resampling is enabled). - */ - if (cm->frame_type == KEY_FRAME) { - cpi->decimation_count = cpi->decimation_factor; - } else if (cpi->decimation_count > 0) { - cpi->decimation_count--; - - cpi->bits_off_target += cpi->av_per_frame_bandwidth; - if (cpi->bits_off_target > cpi->oxcf.maximum_buffer_size) { - cpi->bits_off_target = cpi->oxcf.maximum_buffer_size; - } - -#if CONFIG_MULTI_RES_ENCODING - vp8_store_drop_frame_info(cpi); -#endif - - cm->current_video_frame++; - cpi->frames_since_key++; - cpi->ext_refresh_frame_flags_pending = 0; - // We advance the temporal pattern for dropped frames. 
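The block deleted here is the decimation logic that moved verbatim into vp8_check_drop_buffer() above. For reference, a worked example of the buffer thresholds it computes, using hypothetical configuration values; note the 75/50/25 suffixes are historical and do not match the fractions actually applied:

/* Assume drop_frames_water_mark = 70 (percent) and an optimal buffer
 * level of 600000 bits. */
int drop_mark = (int)(70 * 600000L / 100); /* 420000 bits */
int drop_mark75 = drop_mark * 2 / 3;       /* 280000: really 2/3 */
int drop_mark50 = drop_mark / 4;           /* 105000: really 1/4 */
int drop_mark25 = drop_mark / 8;           /*  52500: really 1/8 */

Decimation then engages as buffer_level falls through these marks, dropping toward a 2/3 or 1/2 frame-rate pattern while scaling per_frame_bandwidth up by 5/4 or 3/2.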
- cpi->temporal_pattern_counter++; - -#if CONFIG_INTERNAL_STATS - cpi->count++; -#endif - - cpi->buffer_level = cpi->bits_off_target; - - if (cpi->oxcf.number_of_layers > 1) { - unsigned int i; - - /* Propagate bits saved by dropping the frame to higher - * layers - */ - for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) { - LAYER_CONTEXT *lc = &cpi->layer_context[i]; - lc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate); - if (lc->bits_off_target > lc->maximum_buffer_size) { - lc->bits_off_target = lc->maximum_buffer_size; - } - lc->buffer_level = lc->bits_off_target; - } - } - - return; - } else { - cpi->decimation_count = cpi->decimation_factor; - } - } else { - cpi->decimation_count = 0; + if (vp8_check_drop_buffer(cpi)) { + return; } /* Decide how big to make the frame */ @@ -3635,7 +3629,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, Q = cpi->avg_frame_qindex; } - /* For constrained quality dont allow Q less than the cq level */ + /* For constrained quality don't allow Q less than the cq level */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && (Q < cpi->cq_target_quality)) { Q = cpi->cq_target_quality; @@ -3662,7 +3656,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } else { cpi->active_best_quality = inter_minq[Q]; - /* For the constant/constrained quality mode we dont want + /* For the constant/constrained quality mode we don't want * q to fall below the cq level. */ if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) && @@ -3683,7 +3677,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * higher quality on the frames to prevent bits just going to waste. */ if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { - /* Note that the use of >= here elliminates the risk of a devide + /* Note that the use of >= here elliminates the risk of a divide * by 0 error in the else if clause */ if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size) { @@ -3772,8 +3766,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_save_coding_context(cpi); - loop_count = 0; - scale_and_extend_source(cpi->un_scaled_source, cpi); #if CONFIG_TEMPORAL_DENOISING && CONFIG_POSTPROC @@ -3946,7 +3938,8 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) { + if (cpi->pass == 0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->rt_drop_recode_on_overshoot == 1) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) { vpx_clear_system_state(); return; @@ -3996,7 +3989,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, q_low = cpi->active_best_quality; q_high = cpi->active_worst_quality; - loop_count++; Loop = 1; continue; @@ -4222,7 +4214,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, if (Loop == 1) { vp8_restore_coding_context(cpi); - loop_count++; #if CONFIG_INTERNAL_STATS cpi->tot_recode_hits++; #endif @@ -4324,12 +4315,12 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, vp8_cal_dissimilarity(cpi); #endif - /* Update the GF useage maps. + /* Update the GF usage maps. * This is done after completing the compression of a frame when all * modes etc. 
are finalized but before loop filter */ if (cpi->oxcf.number_of_layers == 1) { - vp8_update_gf_useage_maps(cpi, cm, &cpi->mb); + vp8_update_gf_usage_maps(cpi, cm, &cpi->mb); } if (cm->frame_type == KEY_FRAME) cm->refresh_last_frame = 1; @@ -4486,7 +4477,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, * size within range) then use the last frame value - 1. The -1 * is designed to stop Q and hence the data rate, from * progressively falling away during difficult sections, but at - * the same time reduce the number of itterations around the + * the same time reduce the number of iterations around the * recode loop. */ if (Q > cpi->ni_av_qi) cpi->ni_av_qi = Q - 1; @@ -4733,7 +4724,7 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, cpi->mb.e_mbd.update_mb_segmentation_data = 0; cpi->mb.e_mbd.mode_ref_lf_delta_update = 0; - /* Dont increment frame counters if this was an altref buffer update + /* Don't increment frame counters if this was an altref buffer update * not a real frame */ if (cm->show_frame) { @@ -5111,7 +5102,7 @@ int vp8_get_compressed_data(VP8_COMP *cpi, unsigned int *frame_flags, if (cm->refresh_last_frame) memcpy(&cpi->lfc_n, &cm->fc, sizeof(cm->fc)); - /* if its a dropped frame honor the requests on subsequent frames */ + /* if it's a dropped frame honor the requests on subsequent frames */ if (*size > 0) { cpi->droppable = !frame_is_reference(cpi); @@ -5385,15 +5376,15 @@ int vp8_set_active_map(VP8_COMP *cpi, unsigned char *map, unsigned int rows, } } -int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { - if (horiz_mode <= ONETWO) { +int vp8_set_internal_size(VP8_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { + if (horiz_mode <= VP8E_ONETWO) { cpi->common.horiz_scale = horiz_mode; } else { return -1; } - if (vert_mode <= ONETWO) { + if (vert_mode <= VP8E_ONETWO) { cpi->common.vert_scale = vert_mode; } else { return -1; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 46a17913a..1451a2781 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -215,7 +215,7 @@ enum { typedef struct { /* Layer configuration */ double framerate; - int target_bandwidth; + int target_bandwidth; /* bits per second */ /* Layer specific coding parameters */ int64_t starting_buffer_level; @@ -360,7 +360,7 @@ typedef struct VP8_COMP { /* GF interval chosen when we coded the last GF */ int current_gf_interval; - /* Total bits overspent becasue of GF boost (cumulative) */ + /* Total bits overspent because of GF boost (cumulative) */ int gf_overspend_bits; /* Used in the few frames following a GF to recover the extra bits @@ -438,7 +438,7 @@ typedef struct VP8_COMP { int kf_boost; int last_boost; - int target_bandwidth; + int target_bandwidth; /* bits per second */ struct vpx_codec_pkt_list *output_pkt_list; #if 0 @@ -526,6 +526,7 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ vpx_atomic_int *mt_current_mb_col; + int mt_current_mb_col_size; int mt_sync_range; vpx_atomic_int b_multi_threaded; int encoding_thread_count; @@ -707,15 +708,19 @@ typedef struct VP8_COMP { // Always update correction factor used for rate control after each frame for // realtime encoding. int rt_always_update_correction_factor; + + // Flag to indicate frame may be dropped due to large expected overshoot, + // and re-encoded on next frame at max_qp. 
+ int rt_drop_recode_on_overshoot; } VP8_COMP; void vp8_initialize_enc(void); void vp8_alloc_compressor_data(VP8_COMP *cpi); int vp8_reverse_trans(int x); -void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_reset_temporal_layer_change(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int prev_num_layers); -void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf, +void vp8_init_temporal_layer_context(VP8_COMP *cpi, const VP8_CONFIG *oxcf, const int layer, double prev_layer_framerate); void vp8_update_layer_contexts(VP8_COMP *cpi); @@ -731,26 +736,8 @@ void vp8_tokenize_mb(VP8_COMP *, MACROBLOCK *, TOKENEXTRA **); void vp8_set_speed_features(VP8_COMP *cpi); -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(cpi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(lval, expr) \ - do { \ - assert(cpi->common.error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif +int vp8_check_drop_buffer(VP8_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 04f68c324..1af8a2f9b 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -1103,7 +1103,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { /* Store for later use by denoiser. */ - // Dont' denoise with GOLDEN OR ALTREF is they are old reference + // Don't denoise with GOLDEN OR ALTREF is they are old reference // frames (greater than MAX_GF_ARF_DENOISE_RANGE frames in past). int skip_old_reference = ((this_ref_frame != LAST_FRAME) && (cpi->common.current_video_frame - diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 9cd3963e2..fcd4eb04e 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -346,7 +346,8 @@ static void calc_iframe_target_size(VP8_COMP *cpi) { /* Minimal target size is |2* per_frame_bandwidth|. */ if (kf_boost < 16) kf_boost = 16; - target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = ((uint64_t)(16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = VPXMIN(INT_MAX, target); } if (cpi->oxcf.rc_max_intra_bitrate_pct) { @@ -388,7 +389,7 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; int Boost = 0; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -398,12 +399,12 @@ static void calc_gf_params(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Not two pass */ if (cpi->pass != 2) { @@ -467,7 +468,7 @@ static void calc_gf_params(VP8_COMP *cpi) { /* Adjust boost based upon ambient Q */ Boost = GFQ_ADJUSTMENT; - /* Adjust based upon most recently measure intra useage */ + /* Adjust based upon most recently measure intra usage */ Boost = Boost * gf_intra_usage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra @@ -475,7 +476,7 @@ static void calc_gf_params(VP8_COMP *cpi) { 100; /* Adjust gf boost based upon GF usage since last GF */ - Boost = Boost * gf_adjust_table[gf_frame_useage] / 100; + Boost = Boost * gf_adjust_table[gf_frame_usage] / 100; #endif } @@ -516,8 +517,8 @@ static void calc_gf_params(VP8_COMP *cpi) { if (cpi->last_boost >= 1500) cpi->frames_till_gf_update_due++; - if (gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due) { - cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_useage]; + if (gf_interval_table[gf_frame_usage] > cpi->frames_till_gf_update_due) { + cpi->frames_till_gf_update_due = gf_interval_table[gf_frame_usage]; } if (cpi->frames_till_gf_update_due > cpi->max_gf_interval) { @@ -718,7 +719,8 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { } /* lower the target bandwidth for this frame. */ - cpi->this_frame_target -= (cpi->this_frame_target * percent_low) / 200; + cpi->this_frame_target -= + (int)(((int64_t)cpi->this_frame_target * percent_low) / 200); /* Are we using allowing control of active_worst_allowed_q * according to buffer level. @@ -895,7 +897,7 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { int Q = (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; - int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int gf_frame_usage = 0; /* Golden frame usage since last GF */ int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + cpi->recent_ref_frame_usage[LAST_FRAME] + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + @@ -905,20 +907,20 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { (cpi->common.mb_rows * cpi->common.mb_cols); if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; + gf_frame_usage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_usage) gf_frame_usage = pct_gf_active; /* Is a fixed manual GF frequency being used */ if (cpi->auto_gold) { - /* For one pass throw a GF if recent frame intra useage is - * low or the GF useage is high + /* For one pass throw a GF if recent frame intra usage is + * low or the GF usage is high */ if ((cpi->pass == 0) && - (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { + (cpi->this_frame_percent_intra < 15 || gf_frame_usage >= 5)) { cpi->common.refresh_golden_frame = 1; /* Two pass GF descision */ @@ -933,10 +935,10 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { if (0) { FILE *f; - f = fopen("gf_useaget.stt", "a"); + f = fopen("gf_usaget.stt", "a"); fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->gfu_boost, - GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_usage); fclose(f); } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index bbddacf8f..5d539ef30 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1021,7 +1021,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, BLOCK *c; BLOCKD *e; - /* Is the best so far sufficiently good that we cant justify + /* Is the best so far sufficiently good that we can't justify * doing a new motion search. 
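
The ratectrl.c hunks above widen two target-size computations to 64 bits before multiplying. A minimal sketch of the pattern, with the clamp that keeps the key-frame result in int range (the function names here are illustrative, not the library's):

#include <limits.h>
#include <stdint.h>

/* (16 + kf_boost) * per_frame_bandwidth can exceed 2^31 for large
 * boosts and bitrates; do the multiply in 64 bits, then clamp. */
static int keyframe_target(int kf_boost, int per_frame_bandwidth) {
  uint64_t target = ((uint64_t)(16 + kf_boost) * per_frame_bandwidth) >> 4;
  if (target > INT_MAX) target = INT_MAX;
  return (int)target;
}

/* Same idea for the signed percentage reduction of a frame target. */
static int lowered_target(int this_frame_target, int percent_low) {
  return this_frame_target -
         (int)(((int64_t)this_frame_target * percent_low) / 200);
}

Casting one operand is enough; the other is promoted before the multiply takes place.
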
*/ if (best_label_rd < label_mv_thresh) break; @@ -1979,7 +1979,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd.distortion2 += distortion; /* If even the 'Y' rd value of split is higher than best so far - * then dont bother looking at UV + * then don't bother looking at UV */ if (tmp_rd < best_mode.yrd) { /* Now work out UV cost and add it in */ diff --git a/vp8/encoder/segmentation.c b/vp8/encoder/segmentation.c index dcb68119e..212725811 100644 --- a/vp8/encoder/segmentation.c +++ b/vp8/encoder/segmentation.c @@ -11,7 +11,7 @@ #include "segmentation.h" #include "vpx_mem/vpx_mem.h" -void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { +void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { int mb_row, mb_col; MODE_INFO *this_mb_mode_info = cm->mi; @@ -19,7 +19,7 @@ void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, MACROBLOCK *x) { x->gf_active_ptr = (signed char *)cpi->gf_active_flags; if ((cm->frame_type == KEY_FRAME) || (cm->refresh_golden_frame)) { - /* Reset Gf useage monitors */ + /* Reset Gf usage monitors */ memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols)); cpi->gf_active_count = cm->mb_rows * cm->mb_cols; } else { diff --git a/vp8/encoder/segmentation.h b/vp8/encoder/segmentation.h index 4ddbdbbd2..0fecfc221 100644 --- a/vp8/encoder/segmentation.h +++ b/vp8/encoder/segmentation.h @@ -19,8 +19,8 @@ extern "C" { #endif -extern void vp8_update_gf_useage_maps(VP8_COMP *cpi, VP8_COMMON *cm, - MACROBLOCK *x); +extern void vp8_update_gf_usage_maps(VP8_COMP *cpi, VP8_COMMON *cm, + MACROBLOCK *x); #ifdef __cplusplus } // extern "C" diff --git a/vp8/encoder/vp8_quantize.c b/vp8/encoder/vp8_quantize.c index 5b8955510..8e5e31824 100644 --- a/vp8/encoder/vp8_quantize.c +++ b/vp8/encoder/vp8_quantize.c @@ -294,7 +294,7 @@ void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip) { /* Select the baseline MB Q index. 
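
The vp8cx_mb_init_quantizer() hunk just below renames mb_segement_abs_delta to mb_segment_abs_delta. A short sketch of the absolute-versus-delta segment Q selection that code performs, with simplified names and VP8's 0..127 Q-index range assumed:

typedef enum { SEG_DELTADATA = 0, SEG_ABSDATA = 1 } SegAbsDelta;

/* With SEG_ABSDATA the segment feature holds the Q index itself;
 * otherwise it is a delta applied to the frame's base Q index. */
static int mb_qindex(SegAbsDelta mode, int base_qindex, int seg_alt_q) {
  int qindex = (mode == SEG_ABSDATA) ? seg_alt_q : base_qindex + seg_alt_q;
  if (qindex < 0) qindex = 0; /* clamp to the legal range */
  if (qindex > 127) qindex = 127;
  return qindex;
}
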
*/ if (xd->segmentation_enabled) { /* Abs Value */ - if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA) { + if (xd->mb_segment_abs_delta == SEGMENT_ABSDATA) { QIndex = xd->segment_feature_data[MB_LVL_ALT_Q] [xd->mode_info_context->mbmi.segment_id]; /* Delta Value */ diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 340f3e663..a6f0b4cbc 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -19,6 +19,9 @@ #include "vpx_ports/static_assert.h" #include "vpx_ports/system_state.h" #include "vpx_util/vpx_timestamp.h" +#if CONFIG_MULTITHREAD +#include "vp8/encoder/ethreading.h" +#endif #include "vp8/encoder/onyx_int.h" #include "vpx/vp8cx.h" #include "vp8/encoder/firstpass.h" @@ -488,6 +491,9 @@ static vpx_codec_err_t vp8e_set_config(vpx_codec_alg_priv_t *ctx, ctx->cfg = *cfg; set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL); vp8_change_config(ctx->cpi, &ctx->oxcf); +#if CONFIG_MULTITHREAD + if (vp8cx_create_encoder_threads(ctx->cpi)) return VPX_CODEC_ERROR; +#endif ctx->cpi->common.error.setjmp = 0; return VPX_CODEC_OK; } @@ -618,10 +624,11 @@ static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, va_list args) { VP8_COMP *cpi = ctx->cpi; - const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + const unsigned int data = CAST(VP8E_SET_RTC_EXTERNAL_RATECTRL, args); if (data) { cpi->cyclic_refresh_mode_enabled = 0; cpi->rt_always_update_correction_factor = 1; + cpi->rt_drop_recode_on_overshoot = 0; } return VPX_CODEC_OK; } @@ -911,12 +918,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } - if (setjmp(ctx->cpi->common.error.jmp)) { - ctx->cpi->common.error.setjmp = 0; - vpx_clear_system_state(); - return VPX_CODEC_CORRUPT_FRAME; - } - /* Initialize the encoder instance on the first frame*/ if (!res && ctx->cpi) { unsigned int lib_flags; @@ -927,6 +928,13 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, unsigned char *cx_data_end; int comp_data_state = 0; + if (setjmp(ctx->cpi->common.error.jmp)) { + ctx->cpi->common.error.setjmp = 0; + vpx_clear_system_state(); + return VPX_CODEC_CORRUPT_FRAME; + } + ctx->cpi->common.error.setjmp = 1; + /* Set up internal flags */ if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) { ((VP8_COMP *)ctx->cpi)->b_calculate_psnr = 1; @@ -962,8 +970,6 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, cx_data_end = ctx->cx_data + cx_data_sz; lib_flags = 0; - ctx->cpi->common.error.setjmp = 1; - while (cx_data_sz >= ctx->cx_data_sz / 2) { comp_data_state = vp8_get_compressed_data( ctx->cpi, &lib_flags, &size, cx_data, cx_data_end, &dst_time_stamp, @@ -1059,6 +1065,7 @@ static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t *ctx, } } } + ctx->cpi->common.error.setjmp = 0; } return res; @@ -1224,8 +1231,8 @@ static vpx_codec_err_t vp8e_set_scalemode(vpx_codec_alg_priv_t *ctx, if (data) { int res; vpx_scaling_mode_t scalemode = *(vpx_scaling_mode_t *)data; - res = vp8_set_internal_size(ctx->cpi, (VPX_SCALING)scalemode.h_scaling_mode, - (VPX_SCALING)scalemode.v_scaling_mode); + res = vp8_set_internal_size(ctx->cpi, scalemode.h_scaling_mode, + scalemode.v_scaling_mode); if (!res) { /*force next frame a key frame to effect scaling mode */ @@ -1292,8 +1299,8 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = { 0, /* rc_resize_allowed */ 1, /* rc_scaled_width */ 1, /* rc_scaled_height */ - 60, /* rc_resize_down_thresold */ - 30, /* rc_resize_up_thresold */ + 60, /* rc_resize_down_thresh */ + 30, 
/* rc_resize_up_thresh */ VPX_VBR, /* rc_end_usage */ { NULL, 0 }, /* rc_twopass_stats_in */ diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index 55a77ba7e..2e5d6dcfe 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -20,6 +20,7 @@ #include "vpx_version.h" #include "common/alloccommon.h" #include "common/common.h" +#include "common/onyxc_int.h" #include "common/onyxd.h" #include "decoder/onyxd_int.h" #include "vpx_dsp/vpx_dsp_common.h" @@ -162,7 +163,10 @@ static vpx_codec_err_t vp8_peek_si_internal(const uint8_t *data, si->h = (clear[8] | (clear[9] << 8)) & 0x3fff; /*printf("w=%d, h=%d\n", si->w, si->h);*/ - if (!(si->h && si->w)) res = VPX_CODEC_CORRUPT_FRAME; + if (!(si->h && si->w)) { + si->w = si->h = 0; + res = VPX_CODEC_CORRUPT_FRAME; + } } else { res = VPX_CODEC_UNSUP_BITSTREAM; } @@ -246,14 +250,14 @@ static int update_fragments(vpx_codec_alg_priv_t *ctx, const uint8_t *data, /* Store a pointer to this fragment and return. We haven't * received the complete frame yet, so we will wait with decoding. */ - ctx->fragments.ptrs[ctx->fragments.count] = data; - ctx->fragments.sizes[ctx->fragments.count] = data_sz; - ctx->fragments.count++; - if (ctx->fragments.count > (1 << EIGHT_PARTITION) + 1) { + if (ctx->fragments.count >= MAX_PARTITIONS) { ctx->fragments.count = 0; *res = VPX_CODEC_INVALID_PARAM; return -1; } + ctx->fragments.ptrs[ctx->fragments.count] = data; + ctx->fragments.sizes[ctx->fragments.count] = data_sz; + ctx->fragments.count++; return 0; } @@ -301,17 +305,26 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } if (!ctx->decoder_init && !ctx->si.is_kf) res = VPX_CODEC_UNSUP_BITSTREAM; + if (!res && ctx->decoder_init && w == 0 && h == 0 && ctx->si.h == 0 && + ctx->si.w == 0) { + VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; + assert(pbi != NULL); + assert(!pbi->common.error.setjmp); + res = VPX_CODEC_CORRUPT_FRAME; + vpx_internal_error(&pbi->common.error, res, + "Keyframe / intra-only frame required to reset decoder" + " state"); + } if ((ctx->si.h != h) || (ctx->si.w != w)) resolution_change = 1; #if CONFIG_MULTITHREAD if (!res && ctx->restart_threads) { - struct frame_buffers *fb = &ctx->yv12_frame_buffers; VP8D_COMP *pbi = ctx->yv12_frame_buffers.pbi[0]; VP8_COMMON *const pc = &pbi->common; if (setjmp(pbi->common.error.jmp)) { - vp8_remove_decoder_instances(fb); - vp8_zero(fb->pbi); + pbi->common.error.setjmp = 0; + vp8_decoder_remove_threads(pbi); vpx_clear_system_state(); return VPX_CODEC_ERROR; } @@ -349,7 +362,14 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, } res = vp8_create_decoder_instances(&ctx->yv12_frame_buffers, &oxcf); - if (res == VPX_CODEC_OK) ctx->decoder_init = 1; + if (res == VPX_CODEC_OK) { + ctx->decoder_init = 1; + } else { + /* on failure clear the cached resolution to ensure a full + * reallocation is attempted on resync. */ + ctx->si.w = 0; + ctx->si.h = 0; + } } /* Set these even if already initialized. 
The caller may have changed the @@ -494,6 +514,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx, /* get ready for the next series of fragments */ ctx->fragments.count = 0; + pbi->common.error.setjmp = 0; } return res; diff --git a/vp8/vp8_ratectrl_rtc.cc b/vp8/vp8_ratectrl_rtc.cc index f3f42529d..dd3c8e623 100644 --- a/vp8/vp8_ratectrl_rtc.cc +++ b/vp8/vp8_ratectrl_rtc.cc @@ -10,7 +10,9 @@ #include <math.h> #include <new> +#include "vp8/common/common.h" #include "vp8/vp8_ratectrl_rtc.h" +#include "vp8/encoder/onyx_int.h" #include "vp8/encoder/ratectrl.h" #include "vpx_ports/system_state.h" @@ -60,12 +62,19 @@ std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create( if (!rc_api->cpi_) return nullptr; vp8_zero(*rc_api->cpi_); - rc_api->InitRateControl(cfg); + if (!rc_api->InitRateControl(cfg)) return nullptr; return rc_api; } -void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { +VP8RateControlRTC::~VP8RateControlRTC() { + if (cpi_) { + vpx_free(cpi_->gf_active_flags); + vpx_free(cpi_); + } +} + +bool VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; oxcf->end_usage = USAGE_STREAM_FROM_SERVER; @@ -83,13 +92,19 @@ void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) { cpi_->kf_bitrate_adjustment = 0; cpi_->gf_overspend_bits = 0; cpi_->non_gf_bitrate_adjustment = 0; - UpdateRateControl(rc_cfg); + if (!UpdateRateControl(rc_cfg)) return false; cpi_->buffer_level = oxcf->starting_buffer_level; cpi_->bits_off_target = oxcf->starting_buffer_level; + return true; } -void VP8RateControlRTC::UpdateRateControl( +bool VP8RateControlRTC::UpdateRateControl( const VP8RateControlRtcConfig &rc_cfg) { + if (rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS) { + return false; + } + VP8_COMMON *cm = &cpi_->common; VP8_CONFIG *oxcf = &cpi_->oxcf; const unsigned int prev_number_of_layers = oxcf->number_of_layers; @@ -118,6 +133,8 @@ void VP8RateControlRTC::UpdateRateControl( cpi_->buffered_mode = oxcf->optimal_buffer_level > 0; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + if (oxcf->drop_frames_water_mark > 0) cpi_->drop_frames_allowed = 1; cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct; cpi_->framerate = rc_cfg.framerate; for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) { @@ -190,9 +207,11 @@ void VP8RateControlRTC::UpdateRateControl( vp8_new_framerate(cpi_, cpi_->framerate); vpx_clear_system_state(); + return true; } -void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { +FrameDropDecision VP8RateControlRTC::ComputeQP( + const VP8FrameParamsQpRTC &frame_params) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); if (cpi_->oxcf.number_of_layers > 1) { @@ -203,14 +222,27 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { vp8_restore_layer_context(cpi_, layer); vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate); } - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type); cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0; cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 
1 : 0; if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) { cpi_->common.frame_flags |= FRAMEFLAGS_KEY; } - vp8_pick_frame_size(cpi_); + cpi_->per_frame_bandwidth = static_cast<int>( + round(cpi_->oxcf.target_bandwidth / cpi_->output_framerate)); + if (vp8_check_drop_buffer(cpi_)) { + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } + + if (!vp8_pick_frame_size(cpi_)) { + cm->current_video_frame++; + cpi_->frames_since_key++; + cpi_->ext_refresh_frame_flags_pending = 0; + if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_); + return FrameDropDecision::kDrop; + } if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level && cpi_->buffered_mode) { @@ -274,10 +306,39 @@ void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) { q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target); vp8_set_quantizer(cpi_, q_); vpx_clear_system_state(); + return FrameDropDecision::kOk; } int VP8RateControlRTC::GetQP() const { return q_; } +int VP8RateControlRTC::GetLoopfilterLevel() const { + VP8_COMMON *cm = &cpi_->common; + const double qp = q_; + + // This model is from linear regression + if (cm->Width * cm->Height <= 320 * 240) { + cm->filter_level = static_cast<int>(0.352685 * qp + 2.957774); + } else if (cm->Width * cm->Height <= 640 * 480) { + cm->filter_level = static_cast<int>(0.485069 * qp - 0.534462); + } else { + cm->filter_level = static_cast<int>(0.314875 * qp + 7.959003); + } + + int min_filter_level = 0; + // This logic is from get_min_filter_level() in picklpf.c + if (q_ > 6 && q_ <= 16) { + min_filter_level = 1; + } else { + min_filter_level = (q_ / 8); + } + + const int max_filter_level = 63; + if (cm->filter_level < min_filter_level) cm->filter_level = min_filter_level; + if (cm->filter_level > max_filter_level) cm->filter_level = max_filter_level; + + return cm->filter_level; +} + void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { VP8_COMMON *const cm = &cpi_->common; vpx_clear_system_state(); diff --git a/vp8/vp8_ratectrl_rtc.h b/vp8/vp8_ratectrl_rtc.h index def7dd8f9..59fb60752 100644 --- a/vp8/vp8_ratectrl_rtc.h +++ b/vp8/vp8_ratectrl_rtc.h @@ -12,23 +12,24 @@ #define VPX_VP8_RATECTRL_RTC_H_ #include <cstdint> +#include <cstring> #include <memory> -#include "vp8/encoder/onyx_int.h" -#include "vp8/common/common.h" #include "vpx/internal/vpx_ratectrl_rtc.h" +struct VP8_COMP; + namespace libvpx { struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP8RateControlRtcConfig() { - vp8_zero(layer_target_bitrate); - vp8_zero(ts_rate_decimator); + memset(&layer_target_bitrate, 0, sizeof(layer_target_bitrate)); + memset(&ts_rate_decimator, 0, sizeof(ts_rate_decimator)); } }; struct VP8FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int temporal_layer_id; }; @@ -36,25 +37,25 @@ class VP8RateControlRTC { public: static std::unique_ptr<VP8RateControlRTC> Create( const VP8RateControlRtcConfig &cfg); - ~VP8RateControlRTC() { - if (cpi_) { - vpx_free(cpi_->gf_active_flags); - vpx_free(cpi_); - } - } + ~VP8RateControlRTC(); - void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); + bool UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; - // int GetLoopfilterLevel() const; - void ComputeQP(const VP8FrameParamsQpRTC &frame_params); + // GetLoopfilterLevel() needs to be called after ComputeQP() since loopfilter + // level is calculated 
from frame qp. + int GetLoopfilterLevel() const; + // ComputeQP computes the QP if the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called. + FrameDropDecision ComputeQP(const VP8FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame void PostEncodeUpdate(uint64_t encoded_frame_size); private: VP8RateControlRTC() {} - void InitRateControl(const VP8RateControlRtcConfig &cfg); - VP8_COMP *cpi_; + bool InitRateControl(const VP8RateControlRtcConfig &cfg); + struct VP8_COMP *cpi_; int q_; }; diff --git a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c index 219ff63cb..b43d7fa4f 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht16x16_add_neon.c @@ -18,7 +18,7 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/inv_txfm.h" -// Use macros to make sure argument lane is passed in as an constant integer. +// Use macros to make sure argument lane is passed in as a constant integer. #define vmull_lane_s32_dual(in, c, lane, out) \ do { \ @@ -64,9 +64,9 @@ highbd_dct_const_round_shift_low_8(const int64x2x2_t *const in) { #define highbd_iadst_half_butterfly(in, c, lane, out) \ do { \ - int64x2x2_t t[2]; \ - vmull_lane_s32_dual(in, c, lane, t); \ - out = highbd_dct_const_round_shift_low_8(t); \ + int64x2x2_t _t[2]; \ + vmull_lane_s32_dual(in, c, lane, _t); \ + out = highbd_dct_const_round_shift_low_8(_t); \ } while (0) #define highbd_iadst_butterfly(in0, in1, c, lane0, lane1, s0, s1) \ diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index d7de46cf4..aa13d8a0d 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -54,7 +54,7 @@ typedef struct { // decoder implementation modules critically rely on the defined entry values // specified herein. They should be refactored concurrently. -#define NONE (-1) +#define NO_REF_FRAME (-1) #define INTRA_FRAME 0 #define LAST_FRAME 1 #define GOLDEN_FRAME 2 diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 8d2bed38e..d63bad93d 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -46,27 +46,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? 
get_msb(num_values) + 1 : 0; } -#if CONFIG_DEBUG -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - assert(&(cm)->error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval " at %s:%d", __FILE__, \ - __LINE__); \ - } while (0) -#else -#define CHECK_MEM_ERROR(cm, lval, expr) \ - do { \ - assert(&(cm)->error.setjmp); \ - (lval) = (expr); \ - if (!(lval)) \ - vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \ - "Failed to allocate " #lval); \ - } while (0) -#endif - #define VP9_SYNC_CODE_0 0x49 #define VP9_SYNC_CODE_1 0x83 #define VP9_SYNC_CODE_2 0x42 diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c index bda824de3..9289fc9e1 100644 --- a/vp9/common/vp9_entropymode.c +++ b/vp9/common/vp9_entropymode.c @@ -381,7 +381,6 @@ void vp9_adapt_mode_probs(VP9_COMMON *cm) { } if (cm->tx_mode == TX_MODE_SELECT) { - int j; unsigned int branch_ct_8x8p[TX_SIZES - 3][2]; unsigned int branch_ct_16x16p[TX_SIZES - 2][2]; unsigned int branch_ct_32x32p[TX_SIZES - 1][2]; diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index 69069042c..71be0f310 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c @@ -150,6 +150,7 @@ void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ if (eob == 1) /* DC only DCT coefficient. */ @@ -164,6 +165,7 @@ void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride, int eob) { + assert(((intptr_t)input) % 32 == 0); if (eob == 1) vpx_idct32x32_1_add(input, dest, stride); else if (eob <= 34) diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 765cb1172..1a9d45ae7 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -932,32 +932,32 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, break; default: for (idx_32 = 0; idx_32 < 4; mip += offset_32[idx_32], ++idx_32) { - const int shift_y = shift_32_y[idx_32]; - const int shift_uv = shift_32_uv[idx_32]; + const int shift_y_32 = shift_32_y[idx_32]; + const int shift_uv_32 = shift_32_uv[idx_32]; const int mi_32_col_offset = ((idx_32 & 1) << 2); const int mi_32_row_offset = ((idx_32 >> 1) << 2); if (mi_32_col_offset >= max_cols || mi_32_row_offset >= max_rows) continue; switch (mip[0]->sb_type) { case BLOCK_32X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); break; case BLOCK_32X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_row_offset + 2 >= max_rows) continue; mip2 = mip + mode_info_stride * 2; - build_masks(lfi_n, mip2[0], shift_y + 16, shift_uv + 4, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 16, shift_uv_32 + 4, lfm); break; case BLOCK_16X32: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_32, shift_uv_32, lfm); if (mi_32_col_offset + 2 >= max_cols) continue; mip2 = mip + 2; - build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm); + build_masks(lfi_n, mip2[0], shift_y_32 + 2, shift_uv_32 + 1, lfm); break; default: for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], 
++idx_16) { - const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16]; - const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16]; + const int shift_y_16 = shift_y_32 + shift_16_y[idx_16]; + const int shift_uv_16 = shift_uv_32 + shift_16_uv[idx_16]; const int mi_16_col_offset = mi_32_col_offset + ((idx_16 & 1) << 1); const int mi_16_row_offset = @@ -968,28 +968,26 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, switch (mip[0]->sb_type) { case BLOCK_16X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); break; case BLOCK_16X8: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_row_offset + 1 >= max_rows) continue; mip2 = mip + mode_info_stride; - build_y_mask(lfi_n, mip2[0], shift_y + 8, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 8, lfm); break; case BLOCK_8X16: - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + build_masks(lfi_n, mip[0], shift_y_16, shift_uv_16, lfm); if (mi_16_col_offset + 1 >= max_cols) continue; mip2 = mip + 1; - build_y_mask(lfi_n, mip2[0], shift_y + 1, lfm); + build_y_mask(lfi_n, mip2[0], shift_y_16 + 1, lfm); break; default: { - const int shift_y = - shift_32_y[idx_32] + shift_16_y[idx_16] + shift_8_y[0]; - build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm); + const int shift_y_8_0 = shift_y_16 + shift_8_y[0]; + build_masks(lfi_n, mip[0], shift_y_8_0, shift_uv_16, lfm); mip += offset[0]; for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) { - const int shift_y = shift_32_y[idx_32] + - shift_16_y[idx_16] + shift_8_y[idx_8]; + const int shift_y_8 = shift_y_16 + shift_8_y[idx_8]; const int mi_8_col_offset = mi_16_col_offset + ((idx_8 & 1)); const int mi_8_row_offset = @@ -998,7 +996,7 @@ void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, if (mi_8_col_offset >= max_cols || mi_8_row_offset >= max_rows) continue; - build_y_mask(lfi_n, mip[0], shift_y, lfm); + build_y_mask(lfi_n, mip[0], shift_y_8, lfm); } break; } diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c index e76d771b8..cf60fa40f 100644 --- a/vp9/common/vp9_mfqe.c +++ b/vp9/common/vp9_mfqe.c @@ -217,6 +217,7 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, const int bsl = b_width_log2_lookup[bs]; PARTITION_TYPE partition = partition_lookup[bsl][cur_bs]; const BLOCK_SIZE subsize = get_subsize(bs, partition); + BLOCK_SIZE mfqe_bs, bs_tmp; if (cur_bs < BLOCK_8X8) { // If there are blocks smaller than 8x8, it must be on the boundary. @@ -236,7 +237,6 @@ static void mfqe_partition(VP9_COMMON *cm, MODE_INFO *mi, BLOCK_SIZE bs, uv_offset = 8; } switch (partition) { - BLOCK_SIZE mfqe_bs, bs_tmp; case PARTITION_HORZ: if (bs == BLOCK_64X64) { mfqe_bs = BLOCK_64X32; diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index ff59ff504..4878dc15e 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -158,18 +158,19 @@ static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // Co-ordinate of containing block to pixel precision. 
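
The build_inter_predictors() hunk below replaces a per-plane if/else chain with indexed lookups. A minimal sketch of that table-driven form, with YV12Buf as a simplified stand-in for YV12_BUFFER_CONFIG:

#include <stdint.h>

typedef struct {
  uint8_t *y_buffer, *u_buffer, *v_buffer;
  int y_stride, uv_stride;
} YV12Buf; /* simplified stand-in */

/* Plane 0 is Y; planes 1 and 2 are U and V, which share a stride. */
static void get_plane_buf(const YV12Buf *ref, int plane, uint8_t **buf,
                          int *stride) {
  uint8_t *const bufs[3] = { ref->y_buffer, ref->u_buffer, ref->v_buffer };
  const int strides[3] = { ref->y_stride, ref->uv_stride, ref->uv_stride };
  *buf = bufs[plane];
  *stride = strides[plane];
}

The hunk also sets pre_buf->stride from the same table, so the pointer and the stride always come from the same plane.
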
const int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)); const int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)); + const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; + uint8_t *buf_array[] = { ref_buf->y_buffer, ref_buf->u_buffer, + ref_buf->v_buffer }; + const int stride_array[] = { ref_buf->y_stride, ref_buf->uv_stride, + ref_buf->uv_stride }; #if 0 // CONFIG_BETTER_HW_COMPATIBILITY assert(xd->mi[0]->sb_type != BLOCK_4X8 && xd->mi[0]->sb_type != BLOCK_8X4); assert(mv_q4.row == mv.row * (1 << (1 - pd->subsampling_y)) && mv_q4.col == mv.col * (1 << (1 - pd->subsampling_x))); #endif - if (plane == 0) - pre_buf->buf = xd->block_refs[ref]->buf->y_buffer; - else if (plane == 1) - pre_buf->buf = xd->block_refs[ref]->buf->u_buffer; - else - pre_buf->buf = xd->block_refs[ref]->buf->v_buffer; + pre_buf->buf = buf_array[plane]; + pre_buf->stride = stride_array[plane]; pre_buf->buf += scaled_buffer_offset(x_start + x, y_start + y, pre_buf->stride, sf); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index f4bd9772c..3ecbd5417 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -23,7 +23,9 @@ struct macroblockd; /* Encoder forward decls */ struct macroblock; -struct vp9_variance_vtable; +struct macroblock_plane; +struct vp9_sad_table; +struct ScanOrder; struct search_site_config; struct mv; union int_mv; @@ -127,24 +129,21 @@ if (vpx_config("CONFIG_VP9_TEMPORAL_DENOISING") eq "yes") { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size"; +specialize qw/vp9_block_error_fp neon avx2 sse2/; -add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/; -add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; +add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - specialize qw/vp9_block_error avx2 sse2/; - - specialize qw/vp9_block_error_fp avx2 sse2/; + specialize qw/vp9_block_error neon avx2 sse2/; add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd"; - specialize qw/vp9_highbd_block_error sse2/; + specialize qw/vp9_highbd_block_error neon sse2/; } else { - specialize qw/vp9_block_error avx2 msa sse2/; - - specialize qw/vp9_block_error_fp neon avx2 sse2/; + 
specialize qw/vp9_block_error neon avx2 msa sse2/; } # fdct functions @@ -174,19 +173,19 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # # Motion search # -add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv"; -specialize qw/vp9_diamond_search_sad avx neon/; +add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, uint32_t start_mv_sad, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv"; +specialize qw/vp9_diamond_search_sad neon/; # # Apply temporal filter # if (vpx_config("CONFIG_REALTIME_ONLY") ne "yes") { add_proto qw/void vp9_apply_temporal_filter/, "const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accumulator, uint16_t *y_count, uint32_t *u_accumulator, uint16_t *u_count, uint32_t *v_accumulator, uint16_t *v_count"; -specialize qw/vp9_apply_temporal_filter sse4_1/; +specialize qw/vp9_apply_temporal_filter sse4_1 neon/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_apply_temporal_filter/, "const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, int strength, const int *const blk_fw, int use_32x32, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count"; - specialize qw/vp9_highbd_apply_temporal_filter sse4_1/; + specialize qw/vp9_highbd_apply_temporal_filter sse4_1 neon/; } } @@ -195,10 +194,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # ENCODEMB INVOKE - add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp avx2 neon/; - add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ; + add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/; # fdct 
functions @@ -206,8 +205,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_highbd_fht4x4 neon/; add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht8x8 neon/; add_proto qw/void vp9_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type"; + specialize qw/vp9_highbd_fht16x16 neon/; add_proto qw/void vp9_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride"; diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c index 8bea61dea..adacb7ef9 100644 --- a/vp9/common/vp9_scan.c +++ b/vp9/common/vp9_scan.c @@ -688,14 +688,14 @@ DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = { 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024, }; -const scan_order vp9_default_scan_orders[TX_SIZES] = { +const ScanOrder vp9_default_scan_orders[TX_SIZES] = { { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, { default_scan_8x8, vp9_default_iscan_8x8, default_scan_8x8_neighbors }, { default_scan_16x16, vp9_default_iscan_16x16, default_scan_16x16_neighbors }, { default_scan_32x32, vp9_default_iscan_32x32, default_scan_32x32_neighbors }, }; -const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES] = { +const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES] = { { // TX_4X4 { default_scan_4x4, vp9_default_iscan_4x4, default_scan_4x4_neighbors }, { row_scan_4x4, vp9_row_iscan_4x4, row_scan_4x4_neighbors }, diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h index 72a9a5ec4..3d1dcc66d 100644 --- a/vp9/common/vp9_scan.h +++ b/vp9/common/vp9_scan.h @@ -23,14 +23,14 @@ extern "C" { #define MAX_NEIGHBORS 2 -typedef struct { +typedef struct ScanOrder { const int16_t *scan; const int16_t *iscan; const int16_t *neighbors; -} scan_order; +} ScanOrder; -extern const scan_order vp9_default_scan_orders[TX_SIZES]; -extern const scan_order vp9_scan_orders[TX_SIZES][TX_TYPES]; +extern const ScanOrder vp9_default_scan_orders[TX_SIZES]; +extern const ScanOrder vp9_scan_orders[TX_SIZES][TX_TYPES]; static INLINE int get_coef_context(const int16_t *neighbors, const uint8_t *token_cache, int c) { @@ -39,8 +39,8 @@ static INLINE int get_coef_context(const int16_t *neighbors, 1; } -static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, - PLANE_TYPE type, int block_idx) { +static INLINE const ScanOrder *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size, + PLANE_TYPE type, int block_idx) { const MODE_INFO *const mi = xd->mi[0]; if (is_inter_block(mi) || type != PLANE_TYPE_Y || xd->lossless) { diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c index b3d50162b..8df18af3b 100644 --- a/vp9/common/vp9_thread_common.c +++ b/vp9/common/vp9_thread_common.c @@ -283,7 +283,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, { int i; - CHECK_MEM_ERROR(cm, lf_sync->mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->mutex, vpx_malloc(sizeof(*lf_sync->mutex) * rows)); if (lf_sync->mutex) { for (i = 0; i < rows; ++i) { @@ -291,7 +291,7 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->cond, + CHECK_MEM_ERROR(&cm->error, lf_sync->cond, vpx_malloc(sizeof(*lf_sync->cond) * rows)); if (lf_sync->cond) { for (i = 0; i < rows; ++i) { @@ -299,23 +299,21 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } } - CHECK_MEM_ERROR(cm, lf_sync->lf_mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->lf_mutex, 
vpx_malloc(sizeof(*lf_sync->lf_mutex))); pthread_mutex_init(lf_sync->lf_mutex, NULL); - CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex, + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_mutex, vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows)); if (lf_sync->recon_done_mutex) { - int i; for (i = 0; i < rows; ++i) { pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL); } } - CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond, + CHECK_MEM_ERROR(&cm->error, lf_sync->recon_done_cond, vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows)); if (lf_sync->recon_done_cond) { - int i; for (i = 0; i < rows; ++i) { pthread_cond_init(&lf_sync->recon_done_cond[i], NULL); } @@ -323,15 +321,15 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, lf_sync->lfdata, + CHECK_MEM_ERROR(&cm->error, lf_sync->lfdata, vpx_malloc(num_workers * sizeof(*lf_sync->lfdata))); lf_sync->num_workers = num_workers; lf_sync->num_active_workers = lf_sync->num_workers; - CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col, + CHECK_MEM_ERROR(&cm->error, lf_sync->cur_sb_col, vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows)); - CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done, + CHECK_MEM_ERROR(&cm->error, lf_sync->num_tiles_done, vpx_malloc(sizeof(*lf_sync->num_tiles_done) * mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2)); @@ -390,10 +388,10 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) { static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { int return_val = -1; - int cur_row; const int max_rows = cm->mi_rows; #if CONFIG_MULTITHREAD + int cur_row; const int tile_cols = 1 << cm->log2_tile_cols; pthread_mutex_lock(lf_sync->lf_mutex); @@ -430,14 +428,8 @@ static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) { #else (void)lf_sync; if (cm->lf_row < max_rows) { - cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2; return_val = cm->lf_row; cm->lf_row += MI_BLOCK_SIZE; - if (cm->lf_row < max_rows) { - /* If this is not the last row, make sure the next row is also decoded. - * This is because the intra predict has to happen before loop filter */ - cur_row += 1; - } } #endif // CONFIG_MULTITHREAD diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 2a27e6fdb..c5892156f 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -323,9 +323,9 @@ static void predict_and_reconstruct_intra_block(TileWorkerData *twd, if (!mi->skip) { const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? &vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); if (eob > 0) { @@ -348,9 +348,9 @@ static void parse_intra_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_TYPE tx_type = (plane || xd->lossless) ? DCT_DCT : intra_mode_to_tx_type_lookup[mode]; - const scan_order *sc = (plane || xd->lossless) - ? &vp9_default_scan_orders[tx_size] - : &vp9_scan_orders[tx_size][tx_type]; + const ScanOrder *sc = (plane || xd->lossless) + ? 
&vp9_default_scan_orders[tx_size] + : &vp9_scan_orders[tx_size][tx_type]; *pd->eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); /* Keep the alignment to 16 */ @@ -393,7 +393,7 @@ static int reconstruct_inter_block(TileWorkerData *twd, MODE_INFO *const mi, int mi_row, int mi_col) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); uint8_t *dst = &pd->dst.buf[4 * row * pd->dst.stride + 4 * col]; @@ -423,7 +423,7 @@ static int parse_inter_block_row_mt(TileWorkerData *twd, MODE_INFO *const mi, TX_SIZE tx_size) { MACROBLOCKD *const xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *sc = &vp9_default_scan_orders[tx_size]; + const ScanOrder *sc = &vp9_default_scan_orders[tx_size]; const int eob = vp9_decode_block_tokens(twd, plane, sc, col, row, tx_size, mi->segment_id); @@ -1469,7 +1469,7 @@ static void resize_mv_buffer(VP9_COMMON *cm) { vpx_free(cm->cur_frame->mvs); cm->cur_frame->mi_rows = cm->mi_rows; cm->cur_frame->mi_cols = cm->mi_cols; - CHECK_MEM_ERROR(cm, cm->cur_frame->mvs, + CHECK_MEM_ERROR(&cm->error, cm->cur_frame->mvs, (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cm->cur_frame->mvs))); } @@ -1776,7 +1776,8 @@ static void vp9_jobq_alloc(VP9Decoder *pbi) { if (jobq_size > row_mt_worker_data->jobq_size) { vpx_free(row_mt_worker_data->jobq_buf); - CHECK_MEM_ERROR(cm, row_mt_worker_data->jobq_buf, vpx_calloc(1, jobq_size)); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->jobq_buf, + vpx_calloc(1, jobq_size)); vp9_jobq_init(&row_mt_worker_data->jobq, row_mt_worker_data->jobq_buf, jobq_size); row_mt_worker_data->jobq_size = jobq_size; @@ -1923,7 +1924,7 @@ static int row_decode_worker_hook(void *arg1, void *arg2) { const int is_last_row = sb_rows - 1 == cur_sb_row; int mi_col_start, mi_col_end; if (!tile_data_recon) - CHECK_MEM_ERROR(cm, tile_data_recon, + CHECK_MEM_ERROR(&cm->error, tile_data_recon, vpx_memalign(32, sizeof(TileWorkerData))); tile_data_recon->xd = pbi->mb; @@ -2025,7 +2026,7 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi, const uint8_t *data, if (cm->lf.filter_level && !cm->skip_loop_filter && pbi->lf_worker.data1 == NULL) { - CHECK_MEM_ERROR(cm, pbi->lf_worker.data1, + CHECK_MEM_ERROR(&cm->error, pbi->lf_worker.data1, vpx_memalign(32, sizeof(LFWorkerData))); pbi->lf_worker.hook = vp9_loop_filter_worker; if (pbi->max_threads > 1 && !winterface->reset(&pbi->lf_worker)) { @@ -2192,8 +2193,6 @@ static int tile_worker_hook(void *arg1, void *arg2) { volatile int mi_row = 0; volatile int n = tile_data->buf_start; - tile_data->error_info.setjmp = 1; - if (setjmp(tile_data->error_info.jmp)) { tile_data->error_info.setjmp = 0; tile_data->xd.corrupted = 1; @@ -2206,6 +2205,7 @@ static int tile_worker_hook(void *arg1, void *arg2) { } return 0; } + tile_data->error_info.setjmp = 1; tile_data->xd.corrupted = 0; @@ -2285,7 +2285,7 @@ static INLINE void init_mt(VP9Decoder *pbi) { if (pbi->num_tile_workers == 0) { const int num_threads = pbi->max_threads; - CHECK_MEM_ERROR(cm, pbi->tile_workers, + CHECK_MEM_ERROR(&cm->error, pbi->tile_workers, vpx_malloc(num_threads * sizeof(*pbi->tile_workers))); for (n = 0; n < num_threads; ++n) { VPxWorker *const worker = &pbi->tile_workers[n]; @@ -2293,6 +2293,11 @@ static INLINE void init_mt(VP9Decoder *pbi) { 
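
Immediately below, init_mt() gains cleanup for partially created worker pools: if reset() fails for worker n, workers 0..n-1 are ended and the array is freed before the error is raised. A minimal sketch of the unwind pattern, with hypothetical Worker helpers standing in for the VPxWorker interface:

typedef struct { int running; } Worker; /* hypothetical stand-in */

static int worker_reset(Worker *w) { w->running = 1; return 1; }
static void worker_end(Worker *w) { w->running = 0; }

/* Start n workers; on failure, unwind the ones already started so no
 * threads are leaked before the error path runs. */
static int start_workers(Worker *workers, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    if (!worker_reset(&workers[i])) {
      while (i-- > 0) worker_end(&workers[i]);
      return 0; /* caller reports VPX_CODEC_ERROR */
    }
  }
  return 1;
}
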
winterface->init(worker); if (n < num_threads - 1 && !winterface->reset(worker)) { + do { + winterface->end(&pbi->tile_workers[pbi->num_tile_workers - 1]); + } while (--pbi->num_tile_workers != 0); + vpx_free(pbi->tile_workers); + pbi->tile_workers = NULL; vpx_internal_error(&cm->error, VPX_CODEC_ERROR, "Tile decoder thread creation failed"); } @@ -2824,7 +2829,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi, const int num_jobs = sb_rows << cm->log2_tile_cols; if (pbi->row_mt_worker_data == NULL) { - CHECK_MEM_ERROR(cm, pbi->row_mt_worker_data, + CHECK_MEM_ERROR(&cm->error, pbi->row_mt_worker_data, vpx_calloc(1, sizeof(*pbi->row_mt_worker_data))); #if CONFIG_MULTITHREAD pthread_mutex_init(&pbi->row_mt_worker_data->recon_done_mutex, NULL); @@ -3006,7 +3011,8 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data, // platforms without DECLARE_ALIGNED(). assert((sizeof(*pbi->tile_worker_data) % 16) == 0); vpx_free(pbi->tile_worker_data); - CHECK_MEM_ERROR(cm, pbi->tile_worker_data, vpx_memalign(32, twd_size)); + CHECK_MEM_ERROR(&cm->error, pbi->tile_worker_data, + vpx_memalign(32, twd_size)); pbi->total_tiles = tile_rows * tile_cols; } diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index db3e74663..0989cde58 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -204,7 +204,7 @@ static void read_intra_frame_mode_info(VP9_COMMON *const cm, mi->skip = read_skip(cm, xd, mi->segment_id, r); mi->tx_size = read_tx_size(cm, xd, 1, r); mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; switch (bsize) { case BLOCK_4X4: @@ -299,7 +299,7 @@ static REFERENCE_MODE read_block_reference_mode(VP9_COMMON *cm, } } -// Read the referncence frame +// Read the reference frame static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, vpx_reader *r, int segment_id, MV_REFERENCE_FRAME ref_frame[2]) { @@ -309,7 +309,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) { ref_frame[0] = (MV_REFERENCE_FRAME)get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME); - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r); // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding @@ -333,7 +333,7 @@ static void read_ref_frames(VP9_COMMON *const cm, MACROBLOCKD *const xd, ref_frame[0] = LAST_FRAME; } - ref_frame[1] = NONE; + ref_frame[1] = NO_REF_FRAME; } else { assert(0 && "Invalid prediction mode."); } @@ -383,7 +383,7 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, mi->interp_filter = SWITCHABLE_FILTERS; mi->ref_frame[0] = INTRA_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; } static INLINE int is_mv_valid(const MV *mv) { @@ -708,7 +708,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, mi->mode = ZEROMV; if (bsize < BLOCK_8X8) { vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM, - "Invalid usage of segement feature on small blocks"); + "Invalid usage of segment feature on small blocks"); return; } } else { diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 7db8ed72d..5a7e9f9ab 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -66,7 +66,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, { int i; CHECK_MEM_ERROR( - cm, row_mt_worker_data->recon_sync_mutex, + &cm->error, 
row_mt_worker_data->recon_sync_mutex, vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_mutex) * num_jobs)); if (row_mt_worker_data->recon_sync_mutex) { for (i = 0; i < num_jobs; ++i) { @@ -75,7 +75,7 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, } CHECK_MEM_ERROR( - cm, row_mt_worker_data->recon_sync_cond, + &cm->error, row_mt_worker_data->recon_sync_cond, vpx_malloc(sizeof(*row_mt_worker_data->recon_sync_cond) * num_jobs)); if (row_mt_worker_data->recon_sync_cond) { for (i = 0; i < num_jobs; ++i) { @@ -86,24 +86,24 @@ void vp9_dec_alloc_row_mt_mem(RowMTWorkerData *row_mt_worker_data, #endif row_mt_worker_data->num_sbs = num_sbs; for (plane = 0; plane < 3; ++plane) { - CHECK_MEM_ERROR(cm, row_mt_worker_data->dqcoeff[plane], - vpx_memalign(16, dqcoeff_size)); + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->dqcoeff[plane], + vpx_memalign(32, dqcoeff_size)); memset(row_mt_worker_data->dqcoeff[plane], 0, dqcoeff_size); - CHECK_MEM_ERROR(cm, row_mt_worker_data->eob[plane], + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->eob[plane], vpx_calloc(num_sbs << EOBS_PER_SB_LOG2, sizeof(*row_mt_worker_data->eob[plane]))); } - CHECK_MEM_ERROR(cm, row_mt_worker_data->partition, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->partition, vpx_calloc(num_sbs * PARTITIONS_PER_SB, sizeof(*row_mt_worker_data->partition))); - CHECK_MEM_ERROR(cm, row_mt_worker_data->recon_map, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->recon_map, vpx_calloc(num_sbs, sizeof(*row_mt_worker_data->recon_map))); // allocate memory for thread_data if (row_mt_worker_data->thread_data == NULL) { const size_t thread_size = max_threads * sizeof(*row_mt_worker_data->thread_data); - CHECK_MEM_ERROR(cm, row_mt_worker_data->thread_data, + CHECK_MEM_ERROR(&cm->error, row_mt_worker_data->thread_data, vpx_memalign(32, thread_size)); } } @@ -181,9 +181,10 @@ VP9Decoder *vp9_decoder_create(BufferPool *const pool) { cm->error.setjmp = 1; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); pbi->need_resync = 1; diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index b0ef83c73..2e198d552 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -54,7 +54,7 @@ typedef struct TileWorkerData { VP9LfSync *lf_sync; DECLARE_ALIGNED(16, MACROBLOCKD, xd); /* dqcoeff are shared by all the planes. 
So planes must be decoded serially */ - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); DECLARE_ALIGNED(16, uint16_t, extend_and_predict_buf[80 * 2 * 80 * 2]); struct vpx_internal_error_info error_info; } TileWorkerData; diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 3ed1bd6ff..d957dc34e 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -272,9 +272,8 @@ static void get_ctx_shift(MACROBLOCKD *xd, int *ctx_shift_a, int *ctx_shift_l, } } -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id) { +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id) { vpx_reader *r = &twd->bit_reader; MACROBLOCKD *xd = &twd->xd; struct macroblockd_plane *const pd = &xd->plane[plane]; diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h index a32052fff..a8e47021b 100644 --- a/vp9/decoder/vp9_detokenize.h +++ b/vp9/decoder/vp9_detokenize.h @@ -19,9 +19,8 @@ extern "C" { #endif -int vp9_decode_block_tokens(TileWorkerData *twd, int plane, - const scan_order *sc, int x, int y, TX_SIZE tx_size, - int seg_id); +int vp9_decode_block_tokens(TileWorkerData *twd, int plane, const ScanOrder *sc, + int x, int y, TX_SIZE tx_size, int seg_id); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index 5961be5f3..997b5477e 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -20,6 +20,7 @@ #include "vpx_dsp/arm/fdct_neon.h" #include "vpx_dsp/arm/fdct4x4_neon.h" #include "vpx_dsp/arm/fdct8x8_neon.h" +#include "vpx_dsp/arm/fdct16x16_neon.h" static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in, int stride) { @@ -1228,4 +1229,945 @@ void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, } } +static INLINE void highbd_load_buffer_8x8(const int16_t *input, + int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/, int stride) { + int16x8_t in[8]; + in[0] = vld1q_s16(input + 0 * stride); + in[1] = vld1q_s16(input + 1 * stride); + in[2] = vld1q_s16(input + 2 * stride); + in[3] = vld1q_s16(input + 3 * stride); + in[4] = vld1q_s16(input + 4 * stride); + in[5] = vld1q_s16(input + 5 * stride); + in[6] = vld1q_s16(input + 6 * stride); + in[7] = vld1q_s16(input + 7 * stride); + lo[0] = vshll_n_s16(vget_low_s16(in[0]), 2); + hi[0] = vshll_n_s16(vget_high_s16(in[0]), 2); + lo[1] = vshll_n_s16(vget_low_s16(in[1]), 2); + hi[1] = vshll_n_s16(vget_high_s16(in[1]), 2); + lo[2] = vshll_n_s16(vget_low_s16(in[2]), 2); + hi[2] = vshll_n_s16(vget_high_s16(in[2]), 2); + lo[3] = vshll_n_s16(vget_low_s16(in[3]), 2); + hi[3] = vshll_n_s16(vget_high_s16(in[3]), 2); + lo[4] = vshll_n_s16(vget_low_s16(in[4]), 2); + hi[4] = vshll_n_s16(vget_high_s16(in[4]), 2); + lo[5] = vshll_n_s16(vget_low_s16(in[5]), 2); + hi[5] = vshll_n_s16(vget_high_s16(in[5]), 2); + lo[6] = vshll_n_s16(vget_low_s16(in[6]), 2); + hi[6] = vshll_n_s16(vget_high_s16(in[6]), 2); + lo[7] = vshll_n_s16(vget_low_s16(in[7]), 2); + hi[7] = vshll_n_s16(vget_high_s16(in[7]), 2); +} + +/* right shift and rounding + * first get the sign bit (bit 15). + * If bit == 1, it's the simple case of shifting right by one bit. + * If bit == 2, it essentially computes the expression: + * + * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2; + * + * for each row. 
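
A scalar sketch of the rounding identity this comment describes: for int32_t, x - (x >> 31) equals x + (x < 0), assuming an arithmetic right shift, which the SIMD code also relies on (the sign bit is bit 31 here, not bit 15):

#include <stdint.h>

/* Round-to-nearest right shift by 1 or 2, matching
 * out = (x + 1 + (x < 0)) >> 2 for bit == 2. */
static int32_t round_shift(int32_t x, int bit) {
  const int32_t sign = x >> 31; /* 0 for x >= 0, -1 for x < 0 */
  if (bit == 2) x += 1;
  x -= sign;                    /* same as x += (x < 0) */
  return x >> bit;              /* bit is 1 or 2 */
}

Making the adjustment branch-free is what lets the vector version below express it with vshrq_n_s32 and vsubq_s32 across all eight lanes.
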
+ */ +static INLINE void highbd_right_shift_8x8(int32x4_t *lo, int32x4_t *hi, + const int bit) { + int32x4_t sign_lo[8], sign_hi[8]; + sign_lo[0] = vshrq_n_s32(lo[0], 31); + sign_hi[0] = vshrq_n_s32(hi[0], 31); + sign_lo[1] = vshrq_n_s32(lo[1], 31); + sign_hi[1] = vshrq_n_s32(hi[1], 31); + sign_lo[2] = vshrq_n_s32(lo[2], 31); + sign_hi[2] = vshrq_n_s32(hi[2], 31); + sign_lo[3] = vshrq_n_s32(lo[3], 31); + sign_hi[3] = vshrq_n_s32(hi[3], 31); + sign_lo[4] = vshrq_n_s32(lo[4], 31); + sign_hi[4] = vshrq_n_s32(hi[4], 31); + sign_lo[5] = vshrq_n_s32(lo[5], 31); + sign_hi[5] = vshrq_n_s32(hi[5], 31); + sign_lo[6] = vshrq_n_s32(lo[6], 31); + sign_hi[6] = vshrq_n_s32(hi[6], 31); + sign_lo[7] = vshrq_n_s32(lo[7], 31); + sign_hi[7] = vshrq_n_s32(hi[7], 31); + + if (bit == 2) { + const int32x4_t const_rounding = vdupq_n_s32(1); + lo[0] = vaddq_s32(lo[0], const_rounding); + hi[0] = vaddq_s32(hi[0], const_rounding); + lo[1] = vaddq_s32(lo[1], const_rounding); + hi[1] = vaddq_s32(hi[1], const_rounding); + lo[2] = vaddq_s32(lo[2], const_rounding); + hi[2] = vaddq_s32(hi[2], const_rounding); + lo[3] = vaddq_s32(lo[3], const_rounding); + hi[3] = vaddq_s32(hi[3], const_rounding); + lo[4] = vaddq_s32(lo[4], const_rounding); + hi[4] = vaddq_s32(hi[4], const_rounding); + lo[5] = vaddq_s32(lo[5], const_rounding); + hi[5] = vaddq_s32(hi[5], const_rounding); + lo[6] = vaddq_s32(lo[6], const_rounding); + hi[6] = vaddq_s32(hi[6], const_rounding); + lo[7] = vaddq_s32(lo[7], const_rounding); + hi[7] = vaddq_s32(hi[7], const_rounding); + } + + lo[0] = vsubq_s32(lo[0], sign_lo[0]); + hi[0] = vsubq_s32(hi[0], sign_hi[0]); + lo[1] = vsubq_s32(lo[1], sign_lo[1]); + hi[1] = vsubq_s32(hi[1], sign_hi[1]); + lo[2] = vsubq_s32(lo[2], sign_lo[2]); + hi[2] = vsubq_s32(hi[2], sign_hi[2]); + lo[3] = vsubq_s32(lo[3], sign_lo[3]); + hi[3] = vsubq_s32(hi[3], sign_hi[3]); + lo[4] = vsubq_s32(lo[4], sign_lo[4]); + hi[4] = vsubq_s32(hi[4], sign_hi[4]); + lo[5] = vsubq_s32(lo[5], sign_lo[5]); + hi[5] = vsubq_s32(hi[5], sign_hi[5]); + lo[6] = vsubq_s32(lo[6], sign_lo[6]); + hi[6] = vsubq_s32(hi[6], sign_hi[6]); + lo[7] = vsubq_s32(lo[7], sign_lo[7]); + hi[7] = vsubq_s32(hi[7], sign_hi[7]); + + if (bit == 1) { + lo[0] = vshrq_n_s32(lo[0], 1); + hi[0] = vshrq_n_s32(hi[0], 1); + lo[1] = vshrq_n_s32(lo[1], 1); + hi[1] = vshrq_n_s32(hi[1], 1); + lo[2] = vshrq_n_s32(lo[2], 1); + hi[2] = vshrq_n_s32(hi[2], 1); + lo[3] = vshrq_n_s32(lo[3], 1); + hi[3] = vshrq_n_s32(hi[3], 1); + lo[4] = vshrq_n_s32(lo[4], 1); + hi[4] = vshrq_n_s32(hi[4], 1); + lo[5] = vshrq_n_s32(lo[5], 1); + hi[5] = vshrq_n_s32(hi[5], 1); + lo[6] = vshrq_n_s32(lo[6], 1); + hi[6] = vshrq_n_s32(hi[6], 1); + lo[7] = vshrq_n_s32(lo[7], 1); + hi[7] = vshrq_n_s32(hi[7], 1); + } else { + lo[0] = vshrq_n_s32(lo[0], 2); + hi[0] = vshrq_n_s32(hi[0], 2); + lo[1] = vshrq_n_s32(lo[1], 2); + hi[1] = vshrq_n_s32(hi[1], 2); + lo[2] = vshrq_n_s32(lo[2], 2); + hi[2] = vshrq_n_s32(hi[2], 2); + lo[3] = vshrq_n_s32(lo[3], 2); + hi[3] = vshrq_n_s32(hi[3], 2); + lo[4] = vshrq_n_s32(lo[4], 2); + hi[4] = vshrq_n_s32(hi[4], 2); + lo[5] = vshrq_n_s32(lo[5], 2); + hi[5] = vshrq_n_s32(hi[5], 2); + lo[6] = vshrq_n_s32(lo[6], 2); + hi[6] = vshrq_n_s32(hi[6], 2); + lo[7] = vshrq_n_s32(lo[7], 2); + hi[7] = vshrq_n_s32(hi[7], 2); + } +} + +static INLINE void highbd_write_buffer_8x8(tran_low_t *output, int32x4_t *lo, + int32x4_t *hi, int stride) { + vst1q_s32(output + 0 * stride, lo[0]); + vst1q_s32(output + 0 * stride + 4, hi[0]); + vst1q_s32(output + 1 * stride, lo[1]); + vst1q_s32(output + 1 * stride + 4, 
hi[1]); + vst1q_s32(output + 2 * stride, lo[2]); + vst1q_s32(output + 2 * stride + 4, hi[2]); + vst1q_s32(output + 3 * stride, lo[3]); + vst1q_s32(output + 3 * stride + 4, hi[3]); + vst1q_s32(output + 4 * stride, lo[4]); + vst1q_s32(output + 4 * stride + 4, hi[4]); + vst1q_s32(output + 5 * stride, lo[5]); + vst1q_s32(output + 5 * stride + 4, hi[5]); + vst1q_s32(output + 6 * stride, lo[6]); + vst1q_s32(output + 6 * stride + 4, hi[6]); + vst1q_s32(output + 7 * stride, lo[7]); + vst1q_s32(output + 7 * stride + 4, hi[7]); +} + +static INLINE void highbd_fadst8x8_neon(int32x4_t *lo /*[8]*/, + int32x4_t *hi /*[8]*/) { + int32x4_t s_lo[8], s_hi[8]; + int32x4_t t_lo[8], t_hi[8]; + int32x4_t x_lo[8], x_hi[8]; + int64x2_t s64_lo[16], s64_hi[16]; + + x_lo[0] = lo[7]; + x_hi[0] = hi[7]; + x_lo[1] = lo[0]; + x_hi[1] = hi[0]; + x_lo[2] = lo[5]; + x_hi[2] = hi[5]; + x_lo[3] = lo[2]; + x_hi[3] = hi[2]; + x_lo[4] = lo[3]; + x_hi[4] = hi[3]; + x_lo[5] = lo[4]; + x_hi[5] = hi[4]; + x_lo[6] = lo[1]; + x_hi[6] = hi[1]; + x_lo[7] = lo[6]; + x_hi[7] = hi[6]; + + // stage 1 + // s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + // s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_2_64, cospi_30_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + // s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_10_64, cospi_22_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + + // s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + // s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_18_64, cospi_14_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + // s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_26_64, cospi_6_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + t_lo[4] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 4]); + t_hi[4] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 4]); + t_lo[5] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 5]); + t_hi[5] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 5]); + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 6]); + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 7]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + // s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], 
t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + + // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + // s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + butterfly_two_coeff_s32_s64_noround( + t_lo[6], t_hi[6], t_lo[7], t_hi[7], -cospi_24_64, cospi_8_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + + // fdct_round_shift + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + + // stage 3 + // s2 = cospi_16_64 * (x2 + x3) + // s3 = cospi_16_64 * (x2 - x3) + butterfly_one_coeff_s32_fast(t_lo[2], t_hi[2], t_lo[3], t_hi[3], cospi_16_64, + &s_lo[2], &s_hi[2], &s_lo[3], &s_hi[3]); + + // s6 = cospi_16_64 * (x6 + x7) + // s7 = cospi_16_64 * (x6 - x7) + butterfly_one_coeff_s32_fast(t_lo[6], t_hi[6], t_lo[7], t_hi[7], cospi_16_64, + &s_lo[6], &s_hi[6], &s_lo[7], &s_hi[7]); + + // x0, x2, x4, x6 pass through + lo[0] = t_lo[0]; + hi[0] = t_hi[0]; + lo[2] = s_lo[6]; + hi[2] = s_hi[6]; + lo[4] = s_lo[3]; + hi[4] = s_hi[3]; + lo[6] = t_lo[5]; + hi[6] = t_hi[5]; + + lo[1] = vnegq_s32(t_lo[4]); + hi[1] = vnegq_s32(t_hi[4]); + lo[3] = vnegq_s32(s_lo[2]); + hi[3] = vnegq_s32(s_hi[2]); + lo[5] = vnegq_s32(s_lo[7]); + hi[5] = vnegq_s32(s_hi[7]); + lo[7] = vnegq_s32(t_lo[1]); + hi[7] = vnegq_s32(t_hi[1]); + + transpose_s32_8x8_2(lo, hi, lo, hi); +} + +void vp9_highbd_fht8x8_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t lo[8], hi[8]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct8x8_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + case DCT_ADST: + highbd_load_buffer_8x8(input, lo, hi, stride); + // pass1 variant is not precise enough + vpx_highbd_fdct8x8_pass2_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_8x8(input, lo, hi, stride); + highbd_fadst8x8_neon(lo, hi); + highbd_fadst8x8_neon(lo, hi); + highbd_right_shift_8x8(lo, hi, 1); + highbd_write_buffer_8x8(output, lo, hi, 8); + break; + } +} + +static INLINE void highbd_load_buffer_16x16( + const int16_t *input, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // load first 8 
columns + highbd_load_buffer_8x8(input, left1, right1, stride); + highbd_load_buffer_8x8(input + 8 * stride, left1 + 8, right1 + 8, stride); + + input += 8; + // load second 8 columns + highbd_load_buffer_8x8(input, left2, right2, stride); + highbd_load_buffer_8x8(input + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_write_buffer_16x16( + tran_low_t *output, int32x4_t *left1 /*[16]*/, int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, int32x4_t *right2 /*[16]*/, int stride) { + // write first 8 columns + highbd_write_buffer_8x8(output, left1, right1, stride); + highbd_write_buffer_8x8(output + 8 * stride, left1 + 8, right1 + 8, stride); + + // write second 8 columns + output += 8; + highbd_write_buffer_8x8(output, left2, right2, stride); + highbd_write_buffer_8x8(output + 8 * stride, left2 + 8, right2 + 8, stride); +} + +static INLINE void highbd_right_shift_16x16(int32x4_t *left1 /*[16]*/, + int32x4_t *right1 /*[16]*/, + int32x4_t *left2 /*[16]*/, + int32x4_t *right2 /*[16]*/, + const int bit) { + // perform rounding operations + highbd_right_shift_8x8(left1, right1, bit); + highbd_right_shift_8x8(left1 + 8, right1 + 8, bit); + highbd_right_shift_8x8(left2, right2, bit); + highbd_right_shift_8x8(left2 + 8, right2 + 8, bit); +} + +static void highbd_fdct16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 1-D DCT for 8 columns + int32x4_t s1_lo[8], s1_hi[8], s2_lo[8], s2_hi[8], s3_lo[8], s3_hi[8]; + int32x4_t left8[8], right8[8]; + + // stage 1 + left8[0] = vaddq_s32(left[0], left[15]); + right8[0] = vaddq_s32(right[0], right[15]); + left8[1] = vaddq_s32(left[1], left[14]); + right8[1] = vaddq_s32(right[1], right[14]); + left8[2] = vaddq_s32(left[2], left[13]); + right8[2] = vaddq_s32(right[2], right[13]); + left8[3] = vaddq_s32(left[3], left[12]); + right8[3] = vaddq_s32(right[3], right[12]); + left8[4] = vaddq_s32(left[4], left[11]); + right8[4] = vaddq_s32(right[4], right[11]); + left8[5] = vaddq_s32(left[5], left[10]); + right8[5] = vaddq_s32(right[5], right[10]); + left8[6] = vaddq_s32(left[6], left[9]); + right8[6] = vaddq_s32(right[6], right[9]); + left8[7] = vaddq_s32(left[7], left[8]); + right8[7] = vaddq_s32(right[7], right[8]); + + // step 1 + s1_lo[0] = vsubq_s32(left[7], left[8]); + s1_hi[0] = vsubq_s32(right[7], right[8]); + s1_lo[1] = vsubq_s32(left[6], left[9]); + s1_hi[1] = vsubq_s32(right[6], right[9]); + s1_lo[2] = vsubq_s32(left[5], left[10]); + s1_hi[2] = vsubq_s32(right[5], right[10]); + s1_lo[3] = vsubq_s32(left[4], left[11]); + s1_hi[3] = vsubq_s32(right[4], right[11]); + s1_lo[4] = vsubq_s32(left[3], left[12]); + s1_hi[4] = vsubq_s32(right[3], right[12]); + s1_lo[5] = vsubq_s32(left[2], left[13]); + s1_hi[5] = vsubq_s32(right[2], right[13]); + s1_lo[6] = vsubq_s32(left[1], left[14]); + s1_hi[6] = vsubq_s32(right[1], right[14]); + s1_lo[7] = vsubq_s32(left[0], left[15]); + s1_hi[7] = vsubq_s32(right[0], right[15]); + + // pass1 variant is not accurate enough + vpx_highbd_fdct8x8_pass2_notranspose_neon(left8, right8); + + // step 2 + // step2[2] = (step1[5] - step1[2]) * cospi_16_64; + // step2[5] = (step1[5] + step1[2]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_16_64, &s2_lo[5], &s2_hi[5], + &s2_lo[2], &s2_hi[2]); + // step2[3] = (step1[4] - step1[3]) * cospi_16_64; + // step2[4] = (step1[4] + step1[3]) * cospi_16_64; + butterfly_one_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_16_64, &s2_lo[4], &s2_hi[4], + &s2_lo[3], &s2_hi[3]); + + // 
step 3 + s3_lo[0] = vaddq_s32(s1_lo[0], s2_lo[3]); + s3_hi[0] = vaddq_s32(s1_hi[0], s2_hi[3]); + s3_lo[1] = vaddq_s32(s1_lo[1], s2_lo[2]); + s3_hi[1] = vaddq_s32(s1_hi[1], s2_hi[2]); + s3_lo[2] = vsubq_s32(s1_lo[1], s2_lo[2]); + s3_hi[2] = vsubq_s32(s1_hi[1], s2_hi[2]); + s3_lo[3] = vsubq_s32(s1_lo[0], s2_lo[3]); + s3_hi[3] = vsubq_s32(s1_hi[0], s2_hi[3]); + s3_lo[4] = vsubq_s32(s1_lo[7], s2_lo[4]); + s3_hi[4] = vsubq_s32(s1_hi[7], s2_hi[4]); + s3_lo[5] = vsubq_s32(s1_lo[6], s2_lo[5]); + s3_hi[5] = vsubq_s32(s1_hi[6], s2_hi[5]); + s3_lo[6] = vaddq_s32(s1_lo[6], s2_lo[5]); + s3_hi[6] = vaddq_s32(s1_hi[6], s2_hi[5]); + s3_lo[7] = vaddq_s32(s1_lo[7], s2_lo[4]); + s3_hi[7] = vaddq_s32(s1_hi[7], s2_hi[4]); + + // step 4 + // s2[1] = cospi_24_64 * s3[6] - cospi_8_64 * s3[1] + // s2[6] = cospi_8_64 * s3[6] + cospi_24_64 * s3[1] + butterfly_two_coeff_s32_s64_narrow(s3_lo[6], s3_hi[6], s3_lo[1], s3_hi[1], + cospi_8_64, cospi_24_64, &s2_lo[6], + &s2_hi[6], &s2_lo[1], &s2_hi[1]); + + // s2[5] = cospi_8_64 * s3[2] - cospi_24_64 * s3[5] + // s2[2] = cospi_24_64 * s3[2] + cospi_8_64 * s3[5] + butterfly_two_coeff_s32_s64_narrow(s3_lo[2], s3_hi[2], s3_lo[5], s3_hi[5], + cospi_24_64, cospi_8_64, &s2_lo[2], + &s2_hi[2], &s2_lo[5], &s2_hi[5]); + + // step 5 + s1_lo[0] = vaddq_s32(s3_lo[0], s2_lo[1]); + s1_hi[0] = vaddq_s32(s3_hi[0], s2_hi[1]); + s1_lo[1] = vsubq_s32(s3_lo[0], s2_lo[1]); + s1_hi[1] = vsubq_s32(s3_hi[0], s2_hi[1]); + s1_lo[2] = vaddq_s32(s3_lo[3], s2_lo[2]); + s1_hi[2] = vaddq_s32(s3_hi[3], s2_hi[2]); + s1_lo[3] = vsubq_s32(s3_lo[3], s2_lo[2]); + s1_hi[3] = vsubq_s32(s3_hi[3], s2_hi[2]); + s1_lo[4] = vsubq_s32(s3_lo[4], s2_lo[5]); + s1_hi[4] = vsubq_s32(s3_hi[4], s2_hi[5]); + s1_lo[5] = vaddq_s32(s3_lo[4], s2_lo[5]); + s1_hi[5] = vaddq_s32(s3_hi[4], s2_hi[5]); + s1_lo[6] = vsubq_s32(s3_lo[7], s2_lo[6]); + s1_hi[6] = vsubq_s32(s3_hi[7], s2_hi[6]); + s1_lo[7] = vaddq_s32(s3_lo[7], s2_lo[6]); + s1_hi[7] = vaddq_s32(s3_hi[7], s2_hi[6]); + + // step 6 + // out[1] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64 + // out[15] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[7], s1_hi[7], s1_lo[0], s1_hi[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + + // out[9] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64 + // out[7] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[6], s1_hi[6], s1_lo[1], s1_hi[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + + // out[5] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64 + // out[11] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[5], s1_hi[5], s1_lo[2], s1_hi[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); + + // out[13] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64 + // out[3] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64 + butterfly_two_coeff_s32_s64_narrow(s1_lo[4], s1_hi[4], s1_lo[3], s1_hi[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + + left[0] = left8[0]; + right[0] = right8[0]; + left[2] = left8[1]; + right[2] = right8[1]; + left[4] = left8[2]; + right[4] = right8[2]; + left[6] = left8[3]; + right[6] = right8[3]; + left[8] = left8[4]; + right[8] = right8[4]; + left[10] = left8[5]; + right[10] = right8[5]; + left[12] = left8[6]; + right[12] = right8[6]; + left[14] = left8[7]; + right[14] = right8[7]; +} + +static void highbd_fadst16_8col(int32x4_t *left, int32x4_t *right) { + // perform 16x16 
1-D ADST for 8 columns + int32x4_t x_lo[16], x_hi[16]; + int32x4_t s_lo[16], s_hi[16]; + int32x4_t t_lo[16], t_hi[16]; + int64x2_t s64_lo[32], s64_hi[32]; + + x_lo[0] = left[15]; + x_hi[0] = right[15]; + x_lo[1] = left[0]; + x_hi[1] = right[0]; + x_lo[2] = left[13]; + x_hi[2] = right[13]; + x_lo[3] = left[2]; + x_hi[3] = right[2]; + x_lo[4] = left[11]; + x_hi[4] = right[11]; + x_lo[5] = left[4]; + x_hi[5] = right[4]; + x_lo[6] = left[9]; + x_hi[6] = right[9]; + x_lo[7] = left[6]; + x_hi[7] = right[6]; + x_lo[8] = left[7]; + x_hi[8] = right[7]; + x_lo[9] = left[8]; + x_hi[9] = right[8]; + x_lo[10] = left[5]; + x_hi[10] = right[5]; + x_lo[11] = left[10]; + x_hi[11] = right[10]; + x_lo[12] = left[3]; + x_hi[12] = right[3]; + x_lo[13] = left[12]; + x_hi[13] = right[12]; + x_lo[14] = left[1]; + x_hi[14] = right[1]; + x_lo[15] = left[14]; + x_hi[15] = right[14]; + + // stage 1, indices are doubled + // s0 = cospi_1_64 * x0 + cospi_31_64 * x1; + // s1 = cospi_31_64 * x0 - cospi_1_64 * x1; + butterfly_two_coeff_s32_s64_noround( + x_lo[0], x_hi[0], x_lo[1], x_hi[1], cospi_1_64, cospi_31_64, + &s64_lo[2 * 0], &s64_hi[2 * 0], &s64_lo[2 * 1], &s64_hi[2 * 1]); + // s2 = cospi_5_64 * x2 + cospi_27_64 * x3; + // s3 = cospi_27_64 * x2 - cospi_5_64 * x3; + butterfly_two_coeff_s32_s64_noround( + x_lo[2], x_hi[2], x_lo[3], x_hi[3], cospi_5_64, cospi_27_64, + &s64_lo[2 * 2], &s64_hi[2 * 2], &s64_lo[2 * 3], &s64_hi[2 * 3]); + // s4 = cospi_9_64 * x4 + cospi_23_64 * x5; + // s5 = cospi_23_64 * x4 - cospi_9_64 * x5; + butterfly_two_coeff_s32_s64_noround( + x_lo[4], x_hi[4], x_lo[5], x_hi[5], cospi_9_64, cospi_23_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = cospi_13_64 * x6 + cospi_19_64 * x7; + // s7 = cospi_19_64 * x6 - cospi_13_64 * x7; + butterfly_two_coeff_s32_s64_noround( + x_lo[6], x_hi[6], x_lo[7], x_hi[7], cospi_13_64, cospi_19_64, + &s64_lo[2 * 6], &s64_hi[2 * 6], &s64_lo[2 * 7], &s64_hi[2 * 7]); + // s8 = cospi_17_64 * x8 + cospi_15_64 * x9; + // s9 = cospi_15_64 * x8 - cospi_17_64 * x9; + butterfly_two_coeff_s32_s64_noround( + x_lo[8], x_hi[8], x_lo[9], x_hi[9], cospi_17_64, cospi_15_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = cospi_21_64 * x10 + cospi_11_64 * x11; + // s11 = cospi_11_64 * x10 - cospi_21_64 * x11; + butterfly_two_coeff_s32_s64_noround( + x_lo[10], x_hi[10], x_lo[11], x_hi[11], cospi_21_64, cospi_11_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = cospi_25_64 * x12 + cospi_7_64 * x13; + // s13 = cospi_7_64 * x12 - cospi_25_64 * x13; + butterfly_two_coeff_s32_s64_noround( + x_lo[12], x_hi[12], x_lo[13], x_hi[13], cospi_25_64, cospi_7_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = cospi_29_64 * x14 + cospi_3_64 * x15; + // s15 = cospi_3_64 * x14 - cospi_29_64 * x15; + butterfly_two_coeff_s32_s64_noround( + x_lo[14], x_hi[14], x_lo[15], x_hi[15], cospi_29_64, cospi_3_64, + &s64_lo[2 * 14], &s64_hi[2 * 14], &s64_lo[2 * 15], &s64_hi[2 * 15]); + + // fdct_round_shift, indices are doubled + t_lo[0] = add_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[0] = add_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[1] = add_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[1] = add_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[2] = add_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[2] = add_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[3] = add_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); 
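For reference, the add_s64_round_narrow()/sub_s64_round_narrow() helpers used throughout these stages implement the fdct_round_shift() step named in the comments, applied to 64-bit lanes: combine the two widened butterfly products, add the DCT rounding constant, shift back down and narrow to 32 bits. A scalar sketch, not part of the patch, assuming DCT_CONST_BITS == 14 as defined in vpx_dsp/txfm_common.h:

#include <stdint.h>

#define DCT_CONST_BITS 14

// Sum two widened butterfly products, then round and narrow back to 32 bits,
// mirroring add_s64_round_narrow(); the sub_ variant subtracts instead.
static int32_t fdct_round_shift_add_ref(int64_t a, int64_t b) {
  const int64_t sum = a + b;
  return (int32_t)((sum + ((int64_t)1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}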
+ t_hi[3] = add_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[6] = add_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[6] = add_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[7] = add_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[7] = add_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + t_lo[8] = sub_s64_round_narrow(&s64_lo[2 * 0], &s64_lo[2 * 8]); + t_hi[8] = sub_s64_round_narrow(&s64_hi[2 * 0], &s64_hi[2 * 8]); + t_lo[9] = sub_s64_round_narrow(&s64_lo[2 * 1], &s64_lo[2 * 9]); + t_hi[9] = sub_s64_round_narrow(&s64_hi[2 * 1], &s64_hi[2 * 9]); + t_lo[10] = sub_s64_round_narrow(&s64_lo[2 * 2], &s64_lo[2 * 10]); + t_hi[10] = sub_s64_round_narrow(&s64_hi[2 * 2], &s64_hi[2 * 10]); + t_lo[11] = sub_s64_round_narrow(&s64_lo[2 * 3], &s64_lo[2 * 11]); + t_hi[11] = sub_s64_round_narrow(&s64_hi[2 * 3], &s64_hi[2 * 11]); + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 12]); + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 13]); + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 6], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 6], &s64_hi[2 * 14]); + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 7], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 7], &s64_hi[2 * 15]); + + // stage 2 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + s_lo[4] = t_lo[4]; + s_hi[4] = t_hi[4]; + s_lo[5] = t_lo[5]; + s_hi[5] = t_hi[5]; + s_lo[6] = t_lo[6]; + s_hi[6] = t_hi[6]; + s_lo[7] = t_lo[7]; + s_hi[7] = t_hi[7]; + // s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + // s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[8], t_hi[8], t_lo[9], t_hi[9], cospi_4_64, cospi_28_64, + &s64_lo[2 * 8], &s64_hi[2 * 8], &s64_lo[2 * 9], &s64_hi[2 * 9]); + // s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + // s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[10], t_hi[10], t_lo[11], t_hi[11], cospi_20_64, cospi_12_64, + &s64_lo[2 * 10], &s64_hi[2 * 10], &s64_lo[2 * 11], &s64_hi[2 * 11]); + // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + // s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[13], t_hi[13], t_lo[12], t_hi[12], cospi_28_64, cospi_4_64, + &s64_lo[2 * 13], &s64_hi[2 * 13], &s64_lo[2 * 12], &s64_hi[2 * 12]); + // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + // s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_12_64, cospi_20_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s4 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 + s5 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 + s6 + t_lo[2] = add_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[2] = add_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 + s7 + t_lo[3] = add_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[3] = 
add_s32_s64_narrow(s_hi[3], s_hi[7]); + + // s0 - s4 + t_lo[4] = sub_s32_s64_narrow(s_lo[0], s_lo[4]); + t_hi[4] = sub_s32_s64_narrow(s_hi[0], s_hi[4]); + // s1 - s5 + t_lo[5] = sub_s32_s64_narrow(s_lo[1], s_lo[5]); + t_hi[5] = sub_s32_s64_narrow(s_hi[1], s_hi[5]); + // s2 - s6 + t_lo[6] = sub_s32_s64_narrow(s_lo[2], s_lo[6]); + t_hi[6] = sub_s32_s64_narrow(s_hi[2], s_hi[6]); + // s3 - s7 + t_lo[7] = sub_s32_s64_narrow(s_lo[3], s_lo[7]); + t_hi[7] = sub_s32_s64_narrow(s_hi[3], s_hi[7]); + + // fdct_round_shift() + // s8 + s12 + t_lo[8] = add_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[8] = add_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 + s13 + t_lo[9] = add_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[9] = add_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 + s14 + t_lo[10] = add_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[10] = add_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 + s15 + t_lo[11] = add_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[11] = add_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // s8 - s12 + t_lo[12] = sub_s64_round_narrow(&s64_lo[2 * 8], &s64_lo[2 * 12]); + t_hi[12] = sub_s64_round_narrow(&s64_hi[2 * 8], &s64_hi[2 * 12]); + // s9 - s13 + t_lo[13] = sub_s64_round_narrow(&s64_lo[2 * 9], &s64_lo[2 * 13]); + t_hi[13] = sub_s64_round_narrow(&s64_hi[2 * 9], &s64_hi[2 * 13]); + // s10 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 10], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 10], &s64_hi[2 * 14]); + // s11 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 11], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 11], &s64_hi[2 * 15]); + + // stage 3 + s_lo[0] = t_lo[0]; + s_hi[0] = t_hi[0]; + s_lo[1] = t_lo[1]; + s_hi[1] = t_hi[1]; + s_lo[2] = t_lo[2]; + s_hi[2] = t_hi[2]; + s_lo[3] = t_lo[3]; + s_hi[3] = t_hi[3]; + // s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + // s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[4], t_hi[4], t_lo[5], t_hi[5], cospi_8_64, cospi_24_64, + &s64_lo[2 * 4], &s64_hi[2 * 4], &s64_lo[2 * 5], &s64_hi[2 * 5]); + // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + // s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[7], t_hi[7], t_lo[6], t_hi[6], cospi_24_64, cospi_8_64, + &s64_lo[2 * 7], &s64_hi[2 * 7], &s64_lo[2 * 6], &s64_hi[2 * 6]); + s_lo[8] = t_lo[8]; + s_hi[8] = t_hi[8]; + s_lo[9] = t_lo[9]; + s_hi[9] = t_hi[9]; + s_lo[10] = t_lo[10]; + s_hi[10] = t_hi[10]; + s_lo[11] = t_lo[11]; + s_hi[11] = t_hi[11]; + // s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + // s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[12], t_hi[12], t_lo[13], t_hi[13], cospi_8_64, cospi_24_64, + &s64_lo[2 * 12], &s64_hi[2 * 12], &s64_lo[2 * 13], &s64_hi[2 * 13]); + // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + // s15 = x14 * cospi_8_64 + x15 * cospi_24_64; + butterfly_two_coeff_s32_s64_noround( + t_lo[15], t_hi[15], t_lo[14], t_hi[14], cospi_24_64, cospi_8_64, + &s64_lo[2 * 15], &s64_hi[2 * 15], &s64_lo[2 * 14], &s64_hi[2 * 14]); + + // s0 + s2 + t_lo[0] = add_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[0] = add_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 + s3 + t_lo[1] = add_s32_s64_narrow(s_lo[1], s_lo[3]); + t_hi[1] = add_s32_s64_narrow(s_hi[1], s_hi[3]); + // s0 - s2 + t_lo[2] = sub_s32_s64_narrow(s_lo[0], s_lo[2]); + t_hi[2] = sub_s32_s64_narrow(s_hi[0], s_hi[2]); + // s1 - s3 + t_lo[3] = sub_s32_s64_narrow(s_lo[1], 
s_lo[3]); + t_hi[3] = sub_s32_s64_narrow(s_hi[1], s_hi[3]); + // fdct_round_shift() + // s4 + s6 + t_lo[4] = add_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[4] = add_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 + s7 + t_lo[5] = add_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[5] = add_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s4 - s6 + t_lo[6] = sub_s64_round_narrow(&s64_lo[2 * 4], &s64_lo[2 * 6]); + t_hi[6] = sub_s64_round_narrow(&s64_hi[2 * 4], &s64_hi[2 * 6]); + // s5 - s7 + t_lo[7] = sub_s64_round_narrow(&s64_lo[2 * 5], &s64_lo[2 * 7]); + t_hi[7] = sub_s64_round_narrow(&s64_hi[2 * 5], &s64_hi[2 * 7]); + // s8 + s10 + t_lo[8] = add_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[8] = add_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 + s11 + t_lo[9] = add_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[9] = add_s32_s64_narrow(s_hi[9], s_hi[11]); + // s8 - s10 + t_lo[10] = sub_s32_s64_narrow(s_lo[8], s_lo[10]); + t_hi[10] = sub_s32_s64_narrow(s_hi[8], s_hi[10]); + // s9 - s11 + t_lo[11] = sub_s32_s64_narrow(s_lo[9], s_lo[11]); + t_hi[11] = sub_s32_s64_narrow(s_hi[9], s_hi[11]); + // fdct_round_shift() + // s12 + s14 + t_lo[12] = add_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[12] = add_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 + s15 + t_lo[13] = add_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[13] = add_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + // s12 - s14 + t_lo[14] = sub_s64_round_narrow(&s64_lo[2 * 12], &s64_lo[2 * 14]); + t_hi[14] = sub_s64_round_narrow(&s64_hi[2 * 12], &s64_hi[2 * 14]); + // s13 - s15 + t_lo[15] = sub_s64_round_narrow(&s64_lo[2 * 13], &s64_lo[2 * 15]); + t_hi[15] = sub_s64_round_narrow(&s64_hi[2 * 13], &s64_hi[2 * 15]); + + // stage 4, with fdct_round_shift + // s2 = (-cospi_16_64) * (x2 + x3); + // s3 = cospi_16_64 * (x2 - x3); + butterfly_one_coeff_s32_s64_narrow(t_lo[3], t_hi[3], t_lo[2], t_hi[2], + -cospi_16_64, &x_lo[2], &x_hi[2], &x_lo[3], + &x_hi[3]); + // s6 = cospi_16_64 * (x6 + x7); + // s7 = cospi_16_64 * (-x6 + x7); + butterfly_one_coeff_s32_s64_narrow(t_lo[7], t_hi[7], t_lo[6], t_hi[6], + cospi_16_64, &x_lo[6], &x_hi[6], &x_lo[7], + &x_hi[7]); + // s10 = cospi_16_64 * (x10 + x11); + // s11 = cospi_16_64 * (-x10 + x11); + butterfly_one_coeff_s32_s64_narrow(t_lo[11], t_hi[11], t_lo[10], t_hi[10], + cospi_16_64, &x_lo[10], &x_hi[10], + &x_lo[11], &x_hi[11]); + // s14 = (-cospi_16_64) * (x14 + x15); + // s15 = cospi_16_64 * (x14 - x15); + butterfly_one_coeff_s32_s64_narrow(t_lo[15], t_hi[15], t_lo[14], t_hi[14], + -cospi_16_64, &x_lo[14], &x_hi[14], + &x_lo[15], &x_hi[15]); + + // Just copy x0, x1, x4, x5, x8, x9, x12, x13 + x_lo[0] = t_lo[0]; + x_hi[0] = t_hi[0]; + x_lo[1] = t_lo[1]; + x_hi[1] = t_hi[1]; + x_lo[4] = t_lo[4]; + x_hi[4] = t_hi[4]; + x_lo[5] = t_lo[5]; + x_hi[5] = t_hi[5]; + x_lo[8] = t_lo[8]; + x_hi[8] = t_hi[8]; + x_lo[9] = t_lo[9]; + x_hi[9] = t_hi[9]; + x_lo[12] = t_lo[12]; + x_hi[12] = t_hi[12]; + x_lo[13] = t_lo[13]; + x_hi[13] = t_hi[13]; + + left[0] = x_lo[0]; + right[0] = x_hi[0]; + left[1] = vnegq_s32(x_lo[8]); + right[1] = vnegq_s32(x_hi[8]); + left[2] = x_lo[12]; + right[2] = x_hi[12]; + left[3] = vnegq_s32(x_lo[4]); + right[3] = vnegq_s32(x_hi[4]); + left[4] = x_lo[6]; + right[4] = x_hi[6]; + left[5] = x_lo[14]; + right[5] = x_hi[14]; + left[6] = x_lo[10]; + right[6] = x_hi[10]; + left[7] = x_lo[2]; + right[7] = x_hi[2]; + left[8] = x_lo[3]; + right[8] = x_hi[3]; + left[9] = x_lo[11]; + right[9] = x_hi[11]; + left[10] = x_lo[15]; + 
right[10] = x_hi[15]; + left[11] = x_lo[7]; + right[11] = x_hi[7]; + left[12] = x_lo[5]; + right[12] = x_hi[5]; + left[13] = vnegq_s32(x_lo[13]); + right[13] = vnegq_s32(x_hi[13]); + left[14] = x_lo[9]; + right[14] = x_hi[9]; + left[15] = vnegq_s32(x_lo[1]); + right[15] = vnegq_s32(x_hi[1]); +} + +static void highbd_fdct16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fdct16_8col(left1, right1); + // Right half. + highbd_fdct16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +static void highbd_fadst16x16_neon(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + // Left half. + highbd_fadst16_8col(left1, right1); + // Right half. + highbd_fadst16_8col(left2, right2); + transpose_s32_16x16(left1, right1, left2, right2); +} + +void vp9_highbd_fht16x16_neon(const int16_t *input, tran_low_t *output, + int stride, int tx_type) { + int32x4_t left1[16], right1[16], left2[16], right2[16]; + + switch (tx_type) { + case DCT_DCT: vpx_highbd_fdct16x16_neon(input, output, stride); break; + case ADST_DCT: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + case DCT_ADST: + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fdct16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + default: + assert(tx_type == ADST_ADST); + highbd_load_buffer_16x16(input, left1, right1, left2, right2, stride); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_right_shift_16x16(left1, right1, left2, right2, 2); + highbd_fadst16x16_neon(left1, right1, left2, right2); + highbd_write_buffer_16x16(output, left1, right1, left2, right2, 16); + break; + } +} + #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c index 53e8c7e49..d631cd437 100644 --- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c +++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c @@ -21,7 +21,7 @@ // Compute the sum of all pixel differences of this MB. static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_s8(v_sum_diff_total); #else const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total); diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c index 33753f77b..b82b3f9db 100644 --- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c +++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c @@ -30,30 +30,6 @@ static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { return result; } -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ?
0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX); - assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX); - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - /***************************************************************************** * This function utilizes 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * @@ -71,8 +47,9 @@ static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, *****************************************************************************/ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, + uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { static const uint32_t data[4] = { 0, 1, 2, 3 }; const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data); @@ -101,8 +78,8 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int)); - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); + const int ref_row = ref_mv->row; + const int ref_col = ref_mv->col; int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; @@ -117,12 +94,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address); #else int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address); #endif - unsigned int best_sad = INT_MAX; + // Starting position + unsigned int best_sad = start_mv_sad; int i, j, step; // Check the prerequisite cost function properties that are easy to check @@ -131,10 +109,6 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { @@ -143,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, int8x16_t v_inside_d; uint32x4_t v_outside_d; int32x4_t v_cost_d, v_sad_d; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 int64x2_t v_blocka[2]; #else int32x4_t v_blocka[1]; @@ -164,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, vreinterpretq_s32_s16(v_these_mv_w))); // If none of them are inside, then move on -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d)); #else horiz_max_0 = 
vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)), @@ -193,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Compute the SIMD pointer offsets. { -#if defined(__aarch64__) // sizeof(intptr_t) == 8 +#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8 // Load the offsets int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]); int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]); @@ -214,13 +188,13 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, #endif } - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); + sad_fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], + in_what_stride, (uint32_t *)&v_sad_d); // Look up the component cost of the residual motion vector { uint32_t cost[4]; - int16_t __attribute__((aligned(16))) rowcol[8]; + DECLARE_ALIGNED(16, int16_t, rowcol[8]); vst1q_s16(rowcol, v_diff_mv_w); // Note: This is a use case for gather instruction @@ -260,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, // Find the minimum value and index horizontally in v_sad_d { uint32_t local_best_sad; -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d)); #else uint32x2_t horiz_min_0 = @@ -282,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d); v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 local_best_idx = vminvq_u32(v_mask_d); #else horiz_min_0 = @@ -306,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x, best_address = new_best_address; v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int)); -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 v_ba_q = vdupq_n_s64((intptr_t)best_address); #else v_ba_d = vdupq_n_s32((intptr_t)best_address); diff --git a/vp9/encoder/arm/neon/vp9_error_neon.c b/vp9/encoder/arm/neon/vp9_error_neon.c index 1c7503139..0cf0bf250 100644 --- a/vp9/encoder/arm/neon/vp9_error_neon.c +++ b/vp9/encoder/arm/neon/vp9_error_neon.c @@ -12,30 +12,91 @@ #include <assert.h> #include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" -int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, - int block_size) { - int64x2_t error = vdupq_n_s64(0); +int64_t vp9_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); - assert(block_size >= 8); - assert((block_size % 8) == 0); + assert(block_size >= 16); + assert((block_size % 16) == 0); do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before + uint32x4_t err; + int32x4_t ssz0, ssz1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so we can store 4 in 32-bits before // accumulating them in 64-bits. 
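The bound quoted in that comment can be cross-checked with a scalar model (illustrative only, not part of the patch; it assumes block_size is a multiple of 4): |diff| <= 2^15 - 1, so each square is below 2^30, and four squares still fit in an unsigned 32-bit accumulator before being folded into the running 64-bit total, which is exactly what the vmull/vmlal/vpadalq sequence vectorizes.

#include <stdint.h>
#include <stdlib.h>

static uint64_t block_error_scalar_ref(const int16_t *coeff,
                                       const int16_t *dqcoeff, int block_size) {
  uint64_t err = 0;
  int i, j;
  for (i = 0; i < block_size; i += 4) {
    uint32_t acc = 0;  // at most 4 * (2^15 - 1)^2 < 2^32, so no overflow
    for (j = 0; j < 4; ++j) {
      const uint32_t d = (uint32_t)abs(coeff[i + j] - dqcoeff[i + j]);
      acc += d * d;
    }
    err += acc;  // widen into 64 bits, as vpadalq_u32 does per vector lane
  }
  return err;
}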
- const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; + err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so we + // can only store 2 of the 30-bit squares in 32 bits before accumulating + // into 64-bits. + ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = horizontal_add_int64x2(ssz_s64); + return (int64_t)horizontal_add_uint64x2(err_u64); +} + +int64_t vp9_block_error_fp_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, int block_size) { + uint64x2_t err_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + uint32x4_t err0, err1; + + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // diff is 15-bits, the squares 30, so in theory we can store 4 in 32-bits + // before accumulating them in 64-bits. However splitting into 2 mull, mlal + // pairs is beneficial since it allows us to use both Neon + // multiply-accumulate pipes - on CPUs that have them - rather than having + // a single chain of 4 instructions executing serially. + err0 = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err0 = vmlal_u16(err0, vget_high_u16(diff0), vget_high_u16(diff0)); + err_u64[0] = vpadalq_u32(err_u64[0], err0); + + err1 = vmull_u16(vget_low_u16(diff1), vget_low_u16(diff1)); + err1 = vmlal_u16(err1, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64[1] = vpadalq_u32(err_u64[1], err1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; } while (block_size != 0); - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); + return horizontal_add_uint64x2(vaddq_u64(err_u64[0], err_u64[1])); } diff --git a/vp9/encoder/arm/neon/vp9_highbd_error_neon.c b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c new file mode 100644 index 000000000..d9b183472 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_highbd_error_neon.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree.
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vp9_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +int64_t vp9_highbd_block_error_neon(const tran_low_t *coeff, + const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz, int bd) { + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); + + const int shift = 2 * (bd - 8); + const int rounding = shift > 0 ? 1 << (shift - 1) : 0; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int32x4_t c = load_tran_low_to_s32q(coeff); + const int32x4_t d = load_tran_low_to_s32q(dqcoeff); + + const uint32x4_t diff = vreinterpretq_u32_s32(vabdq_s32(c, d)); + + err_u64 = vmlal_u32(err_u64, vget_low_u32(diff), vget_low_u32(diff)); + err_u64 = vmlal_u32(err_u64, vget_high_u32(diff), vget_high_u32(diff)); + + ssz_s64 = vmlal_s32(ssz_s64, vget_low_s32(c), vget_low_s32(c)); + ssz_s64 = vmlal_s32(ssz_s64, vget_high_s32(c), vget_high_s32(c)); + + coeff += 4; + dqcoeff += 4; + block_size -= 4; + } while (block_size != 0); + + *ssz = (horizontal_add_int64x2(ssz_s64) + rounding) >> shift; + return ((int64_t)horizontal_add_uint64x2(err_u64) + rounding) >> shift; +} diff --git a/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c new file mode 100644 index 000000000..c3aef3c86 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_highbd_temporal_filter_neon.c @@ -0,0 +1,872 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <arm_neon.h> + +#include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" + +// Compute (a-b)**2 for 8 pixels with size 16-bit +static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, + uint32_t *dst) { + const uint16x8_t a_reg = vld1q_u16(a); + const uint16x8_t b_reg = vld1q_u16(b); + + uint16x8_t dist = vabdq_u16(a_reg, b_reg); + uint32x4_t dist_first = vmull_u16(vget_low_u16(dist), vget_low_u16(dist)); + uint32x4_t dist_second = vmull_u16(vget_high_u16(dist), vget_high_u16(dist)); + + vst1q_u32(dst, dist_first); + vst1q_u32(dst + 4, dist_second); +} + +// Sum up three neighboring distortions for the pixels +static INLINE void highbd_get_sum_4(const uint32_t *dist, uint32x4_t *sum) { + uint32x4_t dist_reg, dist_left, dist_right; + + dist_reg = vld1q_u32(dist); + dist_left = vld1q_u32(dist - 1); + dist_right = vld1q_u32(dist + 1); + + *sum = vaddq_u32(dist_reg, dist_left); + *sum = vaddq_u32(*sum, dist_right); +} + +static INLINE void highbd_get_sum_8(const uint32_t *dist, uint32x4_t *sum_first, + uint32x4_t *sum_second) { + highbd_get_sum_4(dist, sum_first); + highbd_get_sum_4(dist + 4, sum_second); +} + +// Average the value based on the number of values summed (9 for pixels away +// from the border, 4 for pixels in corners, and 6 for other edge values, plus +// however many values are added in from the y/uv planes). +// +// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply +// by weight.
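A scalar sketch of the averaging step that comment describes, as implemented by highbd_average_4() below (illustrative, not part of the patch; per the "modifier * 3 / index" note in the implementation, each mul_constants entry encodes roughly 3 / index scaled into the upper 32 bits):

#include <stdint.h>

// mod = (sum * mul + (rounding << 32)) >> (strength + 32), clamped to 16,
// then inverted and scaled by the subblock weight.
static uint32_t highbd_average_scalar_ref(uint32_t sum, uint32_t mul,
                                          int strength, uint32_t rounding,
                                          int weight) {
  const uint64_t widened = ((uint64_t)rounding << 32) + (uint64_t)sum * mul;
  uint32_t mod = (uint32_t)(widened >> (strength + 32));
  if (mod > 16) mod = 16;                // vminq_u32(sum2, sixteen)
  return (16 - mod) * (uint32_t)weight;  // invert, then multiply by weight
}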
+static INLINE void highbd_average_4(uint32x4_t *output, const uint32x4_t sum, + const uint32x4_t *mul_constants, + const int strength, const int rounding, + const int weight) { + const int64x2_t strength_s64 = vdupq_n_s64(-strength - 32); + const uint64x2_t rounding_u64 = vdupq_n_u64((uint64_t)rounding << 32); + const uint32x4_t weight_u32 = vdupq_n_u32(weight); + const uint32x4_t sixteen = vdupq_n_u32(16); + uint32x4_t sum2; + + // modifier * 3 / index; + uint64x2_t sum_lo = + vmlal_u32(rounding_u64, vget_low_u32(sum), vget_low_u32(*mul_constants)); + uint64x2_t sum_hi = vmlal_u32(rounding_u64, vget_high_u32(sum), + vget_high_u32(*mul_constants)); + + // we cannot use vshrn_n_u64 as strength is not known at compile time. + sum_lo = vshlq_u64(sum_lo, strength_s64); + sum_hi = vshlq_u64(sum_hi, strength_s64); + + sum2 = vcombine_u32(vmovn_u64(sum_lo), vmovn_u64(sum_hi)); + + // Multiply with the weight + sum2 = vminq_u32(sum2, sixteen); + sum2 = vsubq_u32(sixteen, sum2); + *output = vmulq_u32(sum2, weight_u32); +} + +static INLINE void highbd_average_8(uint32x4_t *output_0, uint32x4_t *output_1, + const uint32x4_t sum_0_u32, + const uint32x4_t sum_1_u32, + const uint32x4_t *mul_constants_0, + const uint32x4_t *mul_constants_1, + const int strength, const int rounding, + const int weight) { + highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding, + weight); + highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding, + weight); +} + +// Add 'sum_u32' to 'count'. Multiply by 'pred' and add to 'accumulator.' +static INLINE void highbd_accumulate_and_store_8( + const uint32x4_t sum_first_u32, const uint32x4_t sum_second_u32, + const uint16_t *pred, uint16_t *count, uint32_t *accumulator) { + const uint16x8_t sum_u16 = + vcombine_u16(vqmovn_u32(sum_first_u32), vqmovn_u32(sum_second_u32)); + uint16x8_t pred_u16 = vld1q_u16(pred); + uint16x8_t count_u16 = vld1q_u16(count); + uint32x4_t pred_0_u32, pred_1_u32; + uint32x4_t accum_0_u32, accum_1_u32; + + count_u16 = vqaddq_u16(count_u16, sum_u16); + vst1q_u16(count, count_u16); + + accum_0_u32 = vld1q_u32(accumulator); + accum_1_u32 = vld1q_u32(accumulator + 4); + + pred_0_u32 = vmovl_u16(vget_low_u16(pred_u16)); + pred_1_u32 = vmovl_u16(vget_high_u16(pred_u16)); + + // Don't use sum_u16 as that produces different results to the C version + accum_0_u32 = vmlaq_u32(accum_0_u32, sum_first_u32, pred_0_u32); + accum_1_u32 = vmlaq_u32(accum_1_u32, sum_second_u32, pred_1_u32); + + vst1q_u32(accumulator, accum_0_u32); + vst1q_u32(accumulator + 4, accum_1_u32); +} + +static INLINE void highbd_read_dist_4(const uint32_t *dist, + uint32x4_t *dist_reg) { + *dist_reg = vld1q_u32(dist); +} + +static INLINE void highbd_read_dist_8(const uint32_t *dist, + uint32x4_t *reg_first, + uint32x4_t *reg_second) { + highbd_read_dist_4(dist, reg_first); + highbd_read_dist_4(dist + 4, reg_second); +} + +static INLINE void highbd_read_chroma_dist_row_8( + int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, + uint32x4_t *u_first, uint32x4_t *u_second, uint32x4_t *v_first, + uint32x4_t *v_second) { + if (!ss_x) { + // If there is no chroma subsampling in the horizontal direction, then we + // need to load 8 entries from chroma. 
+ highbd_read_dist_8(u_dist, u_first, u_second); + highbd_read_dist_8(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 4 entries + uint32x4_t u_reg, v_reg; + uint32x4x2_t pair; + + highbd_read_dist_4(u_dist, &u_reg); + + pair = vzipq_u32(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + highbd_read_dist_4(v_dist, &v_reg); + + pair = vzipq_u32(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +static void highbd_apply_temporal_filter_luma_8( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_first, + const uint32_t *const *neighbors_second, int top_weight, + int bottom_weight) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_first, mul_second; + + uint32x4_t sum_row_1_first, sum_row_1_second; + uint32x4_t sum_row_2_first, sum_row_2_second; + uint32x4_t sum_row_3_first, sum_row_3_second; + + uint32x4_t u_first, u_second; + uint32x4_t v_first, v_second; + + uint32x4_t sum_row_first; + uint32x4_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(block_width == 8); + + (void)block_width; + + // First row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Add luma values + highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second); + highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + // We don't need to saturate here because the maximum value is UINT12_MAX ** 2 + // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX + sum_row_first = vaddq_u32(sum_row_2_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_2_second, sum_row_3_second); + + // Add chroma values + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u32(neighbors_first[1]); + mul_second = vld1q_u32(neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + weight = bottom_weight; + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + highbd_get_sum_8(y_dist + DIST_STRIDE,
&sum_row_3_first, &sum_row_3_second); + + sum_row_first = vaddq_u32(sum_row_first, sum_row_3_first); + sum_row_second = vaddq_u32(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, + rounding, weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u32(neighbors_first[0]); + mul_second = vld1q_u32(neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vaddq_u32(sum_row_1_first, sum_row_2_first); + sum_row_second = vaddq_u32(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + } + + sum_row_first = vaddq_u32(sum_row_first, u_first); + sum_row_second = vaddq_u32(sum_row_second, u_second); + sum_row_first = vaddq_u32(sum_row_first, v_first); + sum_row_second = vaddq_u32(sum_row_second, v_second); + + // Get modifier and store result + highbd_average_8(&sum_row_first, &sum_row_second, sum_row_first, + sum_row_second, &mul_first, &mul_second, strength, rounding, + weight); + highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void highbd_apply_temporal_filter_luma( + const uint16_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? 
blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_first; + const uint32_t *const *neighbors_second; + + // Left + neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); + } + + // Right + neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS; + highbd_apply_temporal_filter_luma_8( + y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y, + strength, use_whole_blk, y_accum + blk_col, y_count + blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_first, neighbors_second, top_weight, bottom_weight); +} + +// Add a row of luma distortion that corresponds to 8 chroma mods. If we are +// subsampling in x direction, then we have 16 lumas, else we have 8. 
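+// For example, with ss_x == 1 luma columns 2i and 2i+1 both cover chroma
+// column i, so the 16 luma values are pairwise-added (vpaddlq_u32) and
+// narrowed back to 8 values before being added to the chroma modifiers.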
+static INLINE void highbd_add_luma_dist_to_8_chroma_mod( + const uint32_t *y_dist, int ss_x, int ss_y, uint32x4_t *u_mod_fst, + uint32x4_t *u_mod_snd, uint32x4_t *v_mod_fst, uint32x4_t *v_mod_snd) { + uint32x4_t y_reg_fst, y_reg_snd; + if (!ss_x) { + highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + y_reg_fst = vaddq_u32(y_reg_fst, y_tmp_fst); + y_reg_snd = vaddq_u32(y_reg_snd, y_tmp_snd); + } + } else { + // Temporary + uint32x4_t y_fst, y_snd; + uint64x2_t y_fst64, y_snd64; + + // First 8 + highbd_read_dist_8(y_dist, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_fst = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + + // Second 8 + highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd); + if (ss_y == 1) { + uint32x4_t y_tmp_fst, y_tmp_snd; + highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd); + + y_fst = vaddq_u32(y_fst, y_tmp_fst); + y_snd = vaddq_u32(y_snd, y_tmp_snd); + } + + y_fst64 = vpaddlq_u32(y_fst); + y_snd64 = vpaddlq_u32(y_snd); + y_reg_snd = vcombine_u32(vqmovn_u64(y_fst64), vqmovn_u64(y_snd64)); + } + + *u_mod_fst = vaddq_u32(*u_mod_fst, y_reg_fst); + *u_mod_snd = vaddq_u32(*u_mod_snd, y_reg_snd); + *v_mod_fst = vaddq_u32(*v_mod_fst, y_reg_fst); + *v_mod_snd = vaddq_u32(*v_mod_snd, y_reg_snd); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. 
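+//
+// Per output pixel this computes, roughly, a 3x3 neighborhood sum of the
+// chroma distortions plus the co-located luma distortions, maps that sum to
+// a filter weight via highbd_average_4/highbd_average_8, and accumulates the
+// weight and weight * prediction into the count and accumulator buffers.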
+static void highbd_apply_temporal_filter_chroma_8( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int uv_block_width, unsigned int uv_block_height, int ss_x, + int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count, + uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist, + const uint32_t *u_dist, const uint32_t *v_dist, + const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd, + int top_weight, int bottom_weight, const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + int weight = top_weight; + + uint32x4_t mul_fst, mul_snd; + + uint32x4_t u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst; + uint32x4_t v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst; + uint32x4_t u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd; + uint32x4_t v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd; + + uint32x4_t u_sum_row_fst, v_sum_row_fst; + uint32x4_t u_sum_row_snd, v_sum_row_snd; + + // Loop variable + unsigned int h; + + (void)uv_block_width; + + // First row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Add chroma values + highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + + u_sum_row_fst = vaddq_u32(u_sum_row_2_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_2_snd, u_sum_row_3_snd); + + highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_2_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_2_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul_fst = vld1q_u32(neighbors_fst[1]); + mul_snd = vld1q_u32(neighbors_snd[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + blk_fw += 2; + } else { + weight = bottom_weight; + } + } + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + 
v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd); + u_sum_row_fst = vaddq_u32(u_sum_row_fst, u_sum_row_3_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_snd, u_sum_row_3_snd); + + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd); + v_sum_row_fst = vaddq_u32(v_sum_row_fst, v_sum_row_3_fst); + v_sum_row_snd = vaddq_u32(v_sum_row_snd, v_sum_row_3_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul_fst = vld1q_u32(neighbors_fst[0]); + mul_snd = vld1q_u32(neighbors_snd[0]); + + // Shift the rows up + u_sum_row_1_fst = u_sum_row_2_fst; + u_sum_row_2_fst = u_sum_row_3_fst; + u_sum_row_1_snd = u_sum_row_2_snd; + u_sum_row_2_snd = u_sum_row_3_snd; + + v_sum_row_1_fst = v_sum_row_2_fst; + v_sum_row_2_fst = v_sum_row_3_fst; + v_sum_row_1_snd = v_sum_row_2_snd; + v_sum_row_2_snd = v_sum_row_3_snd; + + // Add chroma values + u_sum_row_fst = vaddq_u32(u_sum_row_1_fst, u_sum_row_2_fst); + v_sum_row_fst = vaddq_u32(v_sum_row_1_fst, v_sum_row_2_fst); + u_sum_row_snd = vaddq_u32(u_sum_row_1_snd, u_sum_row_2_snd); + v_sum_row_snd = vaddq_u32(v_sum_row_1_snd, v_sum_row_2_snd); + + // Add luma values + highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst, + &u_sum_row_snd, &v_sum_row_fst, + &v_sum_row_snd); + + // Get modifier and store result + if (blk_fw) { + highbd_average_4(&u_sum_row_fst, u_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&u_sum_row_snd, u_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + highbd_average_4(&v_sum_row_fst, v_sum_row_fst, &mul_fst, strength, + rounding, blk_fw[0]); + highbd_average_4(&v_sum_row_snd, v_sum_row_snd, &mul_snd, strength, + rounding, blk_fw[1]); + + } else { + highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, u_sum_row_fst, + u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + 
highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, v_sum_row_fst, + v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding, + weight); + } + + highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count, + u_accum); + highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count, + v_accum); +} + +// Perform temporal filter for the chroma components. +static void highbd_apply_temporal_filter_chroma( + const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const uint32_t *const *neighbors_fst; + const uint32_t *const *neighbors_snd; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. + assert(ss_x); + + if (ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } else { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + 
neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, + neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + highbd_apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width, + uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col, + u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col, + y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, + neighbors_snd, top_weight, bottom_weight, NULL); +} + +void vp9_highbd_apply_temporal_filter_neon( + const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre, + int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src, + int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, + int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + + uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint16_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint16_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 4 && strength <= 14 && + "invalid adjusted temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must 
be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 8) { + highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw, + use_whole_blk, y_accum, y_count, y_dist_ptr, + u_dist_ptr, v_dist_ptr); + + highbd_apply_temporal_filter_chroma( + u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y, + strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count, + y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index c2b55fcba..96d061436 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -11,11 +11,13 @@ #include <arm_neon.h> #include <assert.h> #include <math.h> +#include <stdint.h> #include "./vpx_config.h" #include "vpx_mem/vpx_mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -50,7 +52,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr, } static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 return (uint16_t)vmaxvq_s16(v_eobmax); #else const int16x4_t v_eobmax_3210 = @@ -65,23 +67,21 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) { vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); return (uint16_t)vget_lane_s16(v_eobmax_final, 0); -#endif // __aarch64__ +#endif // VPX_ARCH_AARCH64 } -static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - int16x8_t *round, int16x8_t *quant, - int16x8_t *dequant) { - *round = vld1q_s16(round_ptr); - *quant = vld1q_s16(quant_ptr); +static VPX_FORCE_INLINE void load_fp_values( + const struct macroblock_plane *mb_plane, const int16_t *dequant_ptr, + int16x8_t *round, int16x8_t *quant, int16x8_t *dequant) { + *round = vld1q_s16(mb_plane->round_fp); + *quant = vld1q_s16(mb_plane->quant_fp); *dequant = vld1q_s16(dequant_ptr); } static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round, int16x8_t *v_quant, int16x8_t *v_dequant) { -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *v_round = vdupq_laneq_s16(*v_round, 1); *v_quant = vdupq_laneq_s16(*v_quant, 1); *v_dequant = vdupq_laneq_s16(*v_dequant, 1); @@ -117,27 +117,26 @@ static VPX_FORCE_INLINE void quantize_fp_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, 
v_nz_mask); } -void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, const int16_t *quant_ptr, +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; int16x8_t v_eobmax = vdupq_n_s16(-1); int16x8_t v_round, v_quant, v_dequant; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant, - &v_dequant); + load_fp_values(mb_plane, dequant_ptr, &v_round, &v_quant, &v_dequant); // process dc and the first seven ac coeffs quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &v_eobmax); // now process the rest of the ac coeffs update_fp_values(&v_round, &v_quant, &v_dequant); - for (i = 8; i < count; i += 8) { + for (i = 8; i < n_coeffs; i += 8) { quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i, qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax); } @@ -186,23 +185,22 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_8( *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask); } -void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *round_ptr, - const int16_t *quant_ptr, +void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int16x8_t eob_max = vdupq_n_s16(-1); // ROUND_POWER_OF_TWO(round_ptr[], 1) - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round_fp), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant_fp); int16x8_t dequant = vld1q_s16(dequant_ptr); // dequant >> 2 is used similar to zbin as a threshold. int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); int i; + const int16_t *iscan = scan_order->iscan; - (void)scan; - (void)count; + (void)n_coeffs; // Process dc and the first seven ac coeffs. 
quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr, @@ -258,23 +256,21 @@ highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x4_t v_zero = vdup_n_s16(0); - const int16x4_t v_quant = vld1_s16(quant_ptr); + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); - const int16x4_t v_round = vld1_s16(round_ptr); + const int16x4_t v_round = vld1_s16(mb_plane->round_fp); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, @@ -349,22 +345,21 @@ highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vp9_highbd_quantize_fp_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { - const int16x4_t v_quant = vld1_s16(quant_ptr); + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { + const int16x4_t v_quant = vld1_s16(mb_plane->quant_fp); const int16x4_t v_dequant = vld1_s16(dequant_ptr); const int16x4_t v_zero = vdup_n_s16(0); const int16x4_t v_round = - vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14)); + vqrdmulh_n_s16(vld1_s16(mb_plane->round_fp), (int16_t)(1 << 14)); int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero); int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15); int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero); uint16x4_t v_mask_lo, v_mask_hi; int16x8_t v_eobmax = vdupq_n_s16(-1); - - (void)scan; + const int16_t *iscan = scan_order->iscan; // DC and first 3 AC v_mask_lo = diff --git a/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c new file mode 100644 index 000000000..a651a15d9 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_temporal_filter_neon.c @@ -0,0 +1,849 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_temporal_filter.h"
+#include "vp9/encoder/vp9_temporal_filter_constants.h"
+
+// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
+// difference squared, and store as unsigned 16-bit integers to dst.
+static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
+                                uint16_t *dst) {
+  const uint8x8_t a_reg = vld1_u8(a);
+  const uint8x8_t b_reg = vld1_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(a_reg, b_reg);
+  dist_first = vmulq_u16(dist_first, dist_first);
+
+  vst1q_u16(dst, dist_first);
+}
+
+static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
+                                 uint16_t *dst) {
+  const uint8x16_t a_reg = vld1q_u8(a);
+  const uint8x16_t b_reg = vld1q_u8(b);
+
+  uint16x8_t dist_first = vabdl_u8(vget_low_u8(a_reg), vget_low_u8(b_reg));
+  uint16x8_t dist_second = vabdl_u8(vget_high_u8(a_reg), vget_high_u8(b_reg));
+  dist_first = vmulq_u16(dist_first, dist_first);
+  dist_second = vmulq_u16(dist_second, dist_second);
+
+  vst1q_u16(dst, dist_first);
+  vst1q_u16(dst + 8, dist_second);
+}
+
+static INLINE void read_dist_8(const uint16_t *dist, uint16x8_t *dist_reg) {
+  *dist_reg = vld1q_u16(dist);
+}
+
+static INLINE void read_dist_16(const uint16_t *dist, uint16x8_t *reg_first,
+                                uint16x8_t *reg_second) {
+  read_dist_8(dist, reg_first);
+  read_dist_8(dist + 8, reg_second);
+}
+
+// Average the value based on the number of values summed (9 for pixels away
+// from the border, 4 for pixels in corners, and 6 for other edge values).
+//
+// Add in the rounding factor and shift, clamp to 16, invert (16 - sum) and
+// multiply by weight.
+static INLINE uint16x8_t average_8(uint16x8_t sum,
+                                   const uint16x8_t *mul_constants,
+                                   const int strength, const int rounding,
+                                   const uint16x8_t *weight) {
+  const uint32x4_t rounding_u32 = vdupq_n_u32(rounding << 16);
+  const uint16x8_t weight_u16 = *weight;
+  const uint16x8_t sixteen = vdupq_n_u16(16);
+  const int32x4_t strength_s32 = vdupq_n_s32(-strength - 16);
+
+  // modifier * 3 / index, with mul_constants holding (3 << 16) / index.
+  uint32x4_t sum_lo =
+      vmull_u16(vget_low_u16(sum), vget_low_u16(*mul_constants));
+  uint32x4_t sum_hi =
+      vmull_u16(vget_high_u16(sum), vget_high_u16(*mul_constants));
+
+  sum_lo = vqaddq_u32(sum_lo, rounding_u32);
+  sum_hi = vqaddq_u32(sum_hi, rounding_u32);
+
+  // We cannot use vshrn_n_u32 as strength is not known at compile time.
+  sum_lo = vshlq_u32(sum_lo, strength_s32);
+  sum_hi = vshlq_u32(sum_hi, strength_s32);
+
+  sum = vcombine_u16(vmovn_u32(sum_lo), vmovn_u32(sum_hi));
+
+  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
+  // >> 16 (also NEIGHBOR_CONSTANT_4 - 1), which is 49151 / 0xbfff, or -16385
+  // when reinterpreted as signed 16 bits, so the comparison must be done on
+  // unsigned values (vminq_u16).
+  sum = vminq_u16(sum, sixteen);
+  sum = vsubq_u16(sixteen, sum);
+  return vmulq_u16(sum, weight_u16);
+}
+
+// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
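+// In scalar terms, for each of the 8 lanes:
+//   count[i]       = saturating_add(count[i], sum_u16[i]);
+//   accumulator[i] += sum_u16[i] * pred[i];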
+static void accumulate_and_store_8(const uint16x8_t sum_u16,
+                                   const uint8_t *pred, uint16_t *count,
+                                   uint32_t *accumulator) {
+  uint16x8_t pred_u16 = vmovl_u8(vld1_u8(pred));
+  uint16x8_t count_u16 = vld1q_u16(count);
+  uint32x4_t accum_0_u32, accum_1_u32;
+
+  count_u16 = vqaddq_u16(count_u16, sum_u16);
+  vst1q_u16(count, count_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+
+  accum_0_u32 =
+      vmlal_u16(accum_0_u32, vget_low_u16(sum_u16), vget_low_u16(pred_u16));
+  accum_1_u32 =
+      vmlal_u16(accum_1_u32, vget_high_u16(sum_u16), vget_high_u16(pred_u16));
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+}
+
+static INLINE void accumulate_and_store_16(const uint16x8_t sum_0_u16,
+                                           const uint16x8_t sum_1_u16,
+                                           const uint8_t *pred, uint16_t *count,
+                                           uint32_t *accumulator) {
+  uint8x16_t pred_u8 = vld1q_u8(pred);
+  uint16x8_t pred_0_u16 = vmovl_u8(vget_low_u8(pred_u8));
+  uint16x8_t pred_1_u16 = vmovl_u8(vget_high_u8(pred_u8));
+  uint16x8_t count_0_u16 = vld1q_u16(count);
+  uint16x8_t count_1_u16 = vld1q_u16(count + 8);
+  uint32x4_t accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
+
+  count_0_u16 = vqaddq_u16(count_0_u16, sum_0_u16);
+  vst1q_u16(count, count_0_u16);
+  count_1_u16 = vqaddq_u16(count_1_u16, sum_1_u16);
+  vst1q_u16(count + 8, count_1_u16);
+
+  accum_0_u32 = vld1q_u32(accumulator);
+  accum_1_u32 = vld1q_u32(accumulator + 4);
+  accum_2_u32 = vld1q_u32(accumulator + 8);
+  accum_3_u32 = vld1q_u32(accumulator + 12);
+
+  accum_0_u32 =
+      vmlal_u16(accum_0_u32, vget_low_u16(sum_0_u16), vget_low_u16(pred_0_u16));
+  accum_1_u32 = vmlal_u16(accum_1_u32, vget_high_u16(sum_0_u16),
+                          vget_high_u16(pred_0_u16));
+  accum_2_u32 =
+      vmlal_u16(accum_2_u32, vget_low_u16(sum_1_u16), vget_low_u16(pred_1_u16));
+  accum_3_u32 = vmlal_u16(accum_3_u32, vget_high_u16(sum_1_u16),
+                          vget_high_u16(pred_1_u16));
+
+  vst1q_u32(accumulator, accum_0_u32);
+  vst1q_u32(accumulator + 4, accum_1_u32);
+  vst1q_u32(accumulator + 8, accum_2_u32);
+  vst1q_u32(accumulator + 12, accum_3_u32);
+}
+
+// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
+static INLINE void get_sum_8(const uint16_t *y_dist, uint16x8_t *sum) {
+  uint16x8_t dist_reg, dist_left, dist_right;
+
+  dist_reg = vld1q_u16(y_dist);
+  dist_left = vld1q_u16(y_dist - 1);
+  dist_right = vld1q_u16(y_dist + 1);
+
+  *sum = vqaddq_u16(dist_reg, dist_left);
+  *sum = vqaddq_u16(*sum, dist_right);
+}
+
+// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
+// y_dist[i] + y_dist[i+1]. Store the result for the first 8 pixels in
+// sum_first and the rest in sum_second.
+static INLINE void get_sum_16(const uint16_t *y_dist, uint16x8_t *sum_first,
+                              uint16x8_t *sum_second) {
+  get_sum_8(y_dist, sum_first);
+  get_sum_8(y_dist + 8, sum_second);
+}
+
+// Read in a row of chroma values that corresponds to a row of 16 luma
+// values.
+static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
+                                           const uint16_t *v_dist,
+                                           uint16x8_t *u_first,
+                                           uint16x8_t *u_second,
+                                           uint16x8_t *v_first,
+                                           uint16x8_t *v_second) {
+  if (!ss_x) {
+    // If there is no chroma subsampling in the horizontal direction, then we
+    // need to load 16 entries from chroma.
+ read_dist_16(u_dist, u_first, u_second); + read_dist_16(v_dist, v_first, v_second); + } else { // ss_x == 1 + // Otherwise, we only need to load 8 entries + uint16x8_t u_reg, v_reg; + uint16x8x2_t pair; + + read_dist_8(u_dist, &u_reg); + + pair = vzipq_u16(u_reg, u_reg); + *u_first = pair.val[0]; + *u_second = pair.val[1]; + + read_dist_8(v_dist, &v_reg); + + pair = vzipq_u16(v_reg, v_reg); + *v_first = pair.val[0]; + *v_second = pair.val[1]; + } +} + +// Add a row of luma distortion to 8 corresponding chroma mods. +static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist, + int ss_x, int ss_y, + uint16x8_t *u_mod, + uint16x8_t *v_mod) { + uint16x8_t y_reg; + if (!ss_x) { + read_dist_8(y_dist, &y_reg); + if (ss_y == 1) { + uint16x8_t y_tmp; + read_dist_8(y_dist + DIST_STRIDE, &y_tmp); + + y_reg = vqaddq_u16(y_reg, y_tmp); + } + } else { + uint16x8_t y_first, y_second; + uint32x4_t y_first32, y_second32; + + read_dist_16(y_dist, &y_first, &y_second); + if (ss_y == 1) { + uint16x8_t y_tmp_0, y_tmp_1; + read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1); + + y_first = vqaddq_u16(y_first, y_tmp_0); + y_second = vqaddq_u16(y_second, y_tmp_1); + } + + y_first32 = vpaddlq_u16(y_first); + y_second32 = vpaddlq_u16(y_second); + + y_reg = vcombine_u16(vqmovn_u32(y_first32), vqmovn_u32(y_second32)); + } + + *u_mod = vqaddq_u16(*u_mod, y_reg); + *v_mod = vqaddq_u16(*v_mod, y_reg); +} + +// Apply temporal filter to the luma components. This performs temporal +// filtering on a luma block of 16 X block_height. Use blk_fw as an array of +// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_luma_16( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors_first, + const int16_t *const *neighbors_second, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + uint16x8_t weight_first, weight_second; + + uint16x8_t mul_first, mul_second; + + uint16x8_t sum_row_1_first, sum_row_1_second; + uint16x8_t sum_row_2_first, sum_row_2_second; + uint16x8_t sum_row_3_first, sum_row_3_second; + + uint16x8_t u_first, u_second; + uint16x8_t v_first, v_second; + + uint16x8_t sum_row_first; + uint16x8_t sum_row_second; + + // Loop variables + unsigned int h; + + assert(strength >= 0); + assert(strength <= 6); + + assert(block_width == 16); + (void)block_width; + + // Initialize the weights + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[0]); + weight_second = vdupq_n_u16(blk_fw[1]); + } else { + weight_first = vdupq_n_u16(top_weight); + weight_second = weight_first; + } + + // First row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Add luma values + get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_2_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_2_second, sum_row_3_second); + + // Add chroma values + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = 
vqaddq_u16(sum_row_second, u_second); + + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + + // Then all the rows except the last one + mul_first = vld1q_u16((const uint16_t *)neighbors_first[1]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[1]); + + for (h = 1; h < block_height - 1; ++h) { + // Move the weight to bottom half + if (!use_whole_blk && h == block_height / 2) { + if (blk_fw) { + weight_first = vdupq_n_u16(blk_fw[2]); + weight_second = vdupq_n_u16(blk_fw[3]); + } else { + weight_first = vdupq_n_u16(bottom_weight); + weight_second = weight_first; + } + } + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second); + + sum_row_first = vqaddq_u16(sum_row_first, sum_row_3_first); + sum_row_second = vqaddq_u16(sum_row_second, sum_row_3_second); + + // Add chroma values to the modifier + if (ss_y == 0 || h % 2 == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, + &v_first, &v_second); + u_dist += DIST_STRIDE; + v_dist += DIST_STRIDE; + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); + y_pre += y_pre_stride; + y_count += y_pre_stride; + y_accum += y_pre_stride; + y_dist += DIST_STRIDE; + } + + // The last row + mul_first = vld1q_u16((const uint16_t *)neighbors_first[0]); + mul_second = vld1q_u16((const uint16_t *)neighbors_second[0]); + + // Shift the rows up + sum_row_1_first = sum_row_2_first; + sum_row_1_second = sum_row_2_second; + sum_row_2_first = sum_row_3_first; + sum_row_2_second = sum_row_3_second; + + // Add luma values to the modifier + sum_row_first = vqaddq_u16(sum_row_1_first, sum_row_2_first); + sum_row_second = vqaddq_u16(sum_row_1_second, sum_row_2_second); + + // Add chroma values to the modifier + if (ss_y == 0) { + // Only calculate the new chroma distortion if we are at a pixel that + // corresponds to a new chroma row + read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first, + &v_second); + } + + sum_row_first = vqaddq_u16(sum_row_first, u_first); + sum_row_second = vqaddq_u16(sum_row_second, u_second); + sum_row_first = 
vqaddq_u16(sum_row_first, v_first); + sum_row_second = vqaddq_u16(sum_row_second, v_second); + + // Get modifier and store result + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); + accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, + y_accum); +} + +// Perform temporal filter for the luma component. +static void apply_temporal_filter_luma( + const uint8_t *y_pre, int y_pre_stride, unsigned int block_width, + unsigned int block_height, int ss_x, int ss_y, int strength, + const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x; + const unsigned int mid_width = block_width >> 1, + last_width = block_width - blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors_first; + const int16_t *const *neighbors_second; + + if (block_width == 16) { + // Special Case: The block width is 16 and we are operating on a row of 16 + // chroma pixels. In this case, we can't use the usual left-middle-right + // pattern. We also don't support splitting now. + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + if (use_whole_blk) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } else { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, 0, 0, blk_fw); + } + + return; + } + + // Left + neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS; + neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS; + for (; blk_col < mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; blk_col < last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); + } + + // Right + 
neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS; + apply_temporal_filter_luma_16( + y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength, + use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first, + neighbors_second, top_weight, bottom_weight, NULL); +} + +// Apply temporal filter to the chroma components. This performs temporal +// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use +// blk_fw as an array of size 4 for the weights for each of the 4 subblocks, +// else use top_weight for top half, and bottom weight for bottom half. +static void apply_temporal_filter_chroma_8( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int uv_block_height, int ss_x, int ss_y, int strength, + uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist, + const int16_t *const *neighbors, int top_weight, int bottom_weight, + const int *blk_fw) { + const int rounding = (1 << strength) >> 1; + + uint16x8_t weight; + + uint16x8_t mul; + + uint16x8_t u_sum_row_1, u_sum_row_2, u_sum_row_3; + uint16x8_t v_sum_row_1, v_sum_row_2, v_sum_row_3; + + uint16x8_t u_sum_row, v_sum_row; + + // Loop variable + unsigned int h; + + // Initialize weight + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[0]), vdup_n_u16(blk_fw[1])); + } else { + weight = vdupq_n_u16(top_weight); + } + + // First row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Add chroma values + get_sum_8(u_dist, &u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + + u_sum_row = vqaddq_u16(u_sum_row_2, u_sum_row_3); + + get_sum_8(v_dist, &v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_2, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + + // Then all the rows except the last one + mul = vld1q_u16((const uint16_t *)neighbors[1]); + + for (h = 1; h < uv_block_height - 1; ++h) { + // Move the weight pointer to the bottom half of the blocks + if (h == uv_block_height / 2) { + if (blk_fw) { + weight = vcombine_u16(vdup_n_u16(blk_fw[2]), vdup_n_u16(blk_fw[3])); + } else { + weight = vdupq_n_u16(bottom_weight); + } + } + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3); + u_sum_row = vqaddq_u16(u_sum_row, u_sum_row_3); + + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3); + v_sum_row = vqaddq_u16(v_sum_row, v_sum_row_3); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = 
average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); + + u_pre += uv_pre_stride; + u_dist += DIST_STRIDE; + v_pre += uv_pre_stride; + v_dist += DIST_STRIDE; + u_count += uv_pre_stride; + u_accum += uv_pre_stride; + v_count += uv_pre_stride; + v_accum += uv_pre_stride; + + y_dist += DIST_STRIDE * (1 + ss_y); + } + + // The last row + mul = vld1q_u16((const uint16_t *)neighbors[0]); + + // Shift the rows up + u_sum_row_1 = u_sum_row_2; + u_sum_row_2 = u_sum_row_3; + + v_sum_row_1 = v_sum_row_2; + v_sum_row_2 = v_sum_row_3; + + // Add chroma values + u_sum_row = vqaddq_u16(u_sum_row_1, u_sum_row_2); + v_sum_row = vqaddq_u16(v_sum_row_1, v_sum_row_2); + + // Add luma values + add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); + + // Get modifier and store result + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); + accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); +} + +// Perform temporal filter for the chroma components. +static void apply_temporal_filter_chroma( + const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride, + unsigned int block_width, unsigned int block_height, int ss_x, int ss_y, + int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count, + const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) { + const unsigned int uv_width = block_width >> ss_x, + uv_height = block_height >> ss_y; + + unsigned int blk_col = 0, uv_blk_col = 0; + const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x; + const unsigned int uv_mid_width = uv_width >> 1, + uv_last_width = uv_width - uv_blk_col_step; + int top_weight = blk_fw[0], + bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2]; + const int16_t *const *neighbors; + + if (uv_width == 8) { + // Special Case: We are subsampling in x direction on a 16x16 block. Since + // we are operating on a row of 8 chroma pixels, we can't use the usual + // left-middle-right pattern. 
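+    // Instead a single neighbor table spans the entire 8-wide chroma row,
+    // carrying the per-lane neighbor counts for both edges and the middle.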
+ assert(ss_x); + + if (ss_y) { + neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS; + } + + if (use_whole_blk) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } else { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, + ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw); + } + + return; + } + + // Left + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + + blk_col += blk_col_step; + uv_blk_col += uv_blk_col_step; + + // Middle First + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS; + } + + for (; uv_blk_col < uv_mid_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + if (!use_whole_blk) { + top_weight = blk_fw[1]; + bottom_weight = blk_fw[3]; + } + + // Middle Second + for (; uv_blk_col < uv_last_width; + blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) { + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); + } + + // Right + if (ss_x && ss_y) { + neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else if (ss_x || ss_y) { + neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS; + } else { + neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS; + } + + apply_temporal_filter_chroma_8( + u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x, + ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col, + v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col, + u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight, + bottom_weight, NULL); +} + +void vp9_apply_temporal_filter_neon( + const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre, + int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src, + int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre, + int uv_pre_stride, unsigned int block_width, unsigned int block_height, 
+ int ss_x, int ss_y, int strength, const int *const blk_fw, + int use_whole_blk, uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, + uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count) { + const unsigned int chroma_height = block_height >> ss_y, + chroma_width = block_width >> ss_x; + + DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 }; + DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 }; + const int *blk_fw_ptr = blk_fw; + + uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1, + *v_dist_ptr = v_dist + 1; + const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src; + const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre; + + // Loop variables + unsigned int row, blk_col; + + assert(block_width <= BW && "block width too large"); + assert(block_height <= BH && "block height too large"); + assert(block_width % 16 == 0 && "block width must be multiple of 16"); + assert(block_height % 2 == 0 && "block height must be even"); + assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) && + "invalid chroma subsampling"); + assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength"); + assert(blk_fw[0] >= 0 && "filter weight must be positive"); + assert( + (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) && + "subblock filter weight must be positive"); + assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2"); + assert( + (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) && + "subblock filter weight must be less than 2"); + + // Precompute the difference squared + for (row = 0; row < block_height; row++) { + for (blk_col = 0; blk_col < block_width; blk_col += 16) { + store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col, + y_dist_ptr + blk_col); + } + y_src_ptr += y_src_stride; + y_pre_ptr += y_pre_stride; + y_dist_ptr += DIST_STRIDE; + } + + for (row = 0; row < chroma_height; row++) { + for (blk_col = 0; blk_col < chroma_width; blk_col += 8) { + store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col, + u_dist_ptr + blk_col); + store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col, + v_dist_ptr + blk_col); + } + + u_src_ptr += uv_src_stride; + u_pre_ptr += uv_pre_stride; + u_dist_ptr += DIST_STRIDE; + v_src_ptr += uv_src_stride; + v_pre_ptr += uv_pre_stride; + v_dist_ptr += DIST_STRIDE; + } + + y_dist_ptr = y_dist + 1; + u_dist_ptr = u_dist + 1; + v_dist_ptr = v_dist + 1; + + apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height, + ss_x, ss_y, strength, blk_fw_ptr, use_whole_blk, + y_accum, y_count, y_dist_ptr, u_dist_ptr, + v_dist_ptr); + + apply_temporal_filter_chroma(u_pre, v_pre, uv_pre_stride, block_width, + block_height, ss_x, ss_y, strength, blk_fw_ptr, + use_whole_blk, u_accum, u_count, v_accum, + v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr); +} diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c index bd3812036..ef3423f8e 100644 --- a/vp9/encoder/vp9_aq_complexity.c +++ b/vp9/encoder/vp9_aq_complexity.c @@ -87,7 +87,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { &cpi->rc, cm->frame_type, cm->base_qindex, aq_c_q_adj_factor[aq_strength][segment], cm->bit_depth); - // For AQ complexity mode, we dont allow Q0 in a segment if the base + // For AQ complexity mode, we don't allow Q0 in a segment if the base // Q is not 0. 
Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment // Q delta is sometimes applied without going back around the rd loop. // This could lead to an illegal combination of partition size and q. diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index a84c8b524..ca56d14aa 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -169,8 +169,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, vpx_write_bit(w, p->extra & 1); } else { // t >= TWO_TOKEN && t < EOB_TOKEN const struct vp9_token *const a = &vp9_coef_encodings[t]; - const int v = a->value; - const int n = a->len; + int v = a->value; + int n = a->len; const int e = p->extra; vpx_write(w, 1, context_tree[2]); vp9_write_tree(w, vp9_coef_con_tree, @@ -179,8 +179,8 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp, if (t >= CATEGORY1_TOKEN) { const vp9_extra_bit *const b = &extra_bits[t]; const unsigned char *pb = b->prob; - int v = e >> 1; - int n = b->len; // number of bits in v, assumed nonzero + v = e >> 1; + n = b->len; // number of bits in v, assumed nonzero do { const int bb = (v >> --n) & 1; vpx_write(w, bb, *pb++); @@ -599,7 +599,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, for (t = 0; t < entropy_nodes_update; ++t) { vpx_prob newp = new_coef_probs[i][j][k][l][t]; vpx_prob *oldp = old_coef_probs[i][j][k][l] + t; - const vpx_prob upd = DIFF_UPDATE_PROB; int64_t s; int u = 0; if (t == PIVOT_NODE) @@ -968,13 +967,13 @@ static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) { int i; const size_t worker_data_size = cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); - CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data, + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data, vpx_memalign(16, worker_data_size)); memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); for (i = 1; i < cpi->num_workers; ++i) { cpi->vp9_bitstream_worker_data[i].dest_size = cpi->oxcf.width * cpi->oxcf.height; - CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest, + CHECK_MEM_ERROR(&cm->error, cpi->vp9_bitstream_worker_data[i].dest, vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size)); } } diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 20294b4b9..7fa00cd19 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -13,6 +13,7 @@ #include "vpx_util/vpx_thread.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_entropy.h" @@ -24,7 +25,7 @@ typedef struct { unsigned int sse; int sum; unsigned int var; -} diff; +} Diff; struct macroblock_plane { DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]); @@ -33,8 +34,8 @@ struct macroblock_plane { uint16_t *eobs; struct buf_2d src; - // Quantizer setings - DECLARE_ALIGNED(16, int16_t, round_fp[8]); + // Quantizer settings + int16_t *round_fp; int16_t *quant_fp; int16_t *quant; int16_t *quant_shift; @@ -78,16 +79,16 @@ struct macroblock { int skip_recode; int skip_optimize; int q_index; - int block_qcoeff_opt; + double log_block_src_var; int block_tx_domain; // The equivalent error at the current rdmult of one whole bit (not one // bitcost unit). int errorperbit; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for large blocks. 
int sadperbit16; - // The equivalend SAD error of one (whole) bit at the current quantizer + // The equivalent SAD error of one (whole) bit at the current quantizer // for sub-8x8 blocks. int sadperbit4; int rddiv; @@ -127,7 +128,7 @@ struct macroblock { // from extending outside the UMV borders MvLimits mv_limits; - // Notes transform blocks where no coefficents are coded. + // Notes transform blocks where no coefficients are coded. // Set during mode selection. Read during block encoding. uint8_t zcoeff_blk[TX_SIZES][256]; diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c index b74b9027c..42073f756 100644 --- a/vp9/encoder/vp9_context_tree.c +++ b/vp9/encoder/vp9_context_tree.c @@ -25,16 +25,17 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk, int i, k; ctx->num_4x4_blk = num_blk; - CHECK_MEM_ERROR(cm, ctx->zcoeff_blk, vpx_calloc(num_blk, sizeof(uint8_t))); + CHECK_MEM_ERROR(&cm->error, ctx->zcoeff_blk, + vpx_calloc(num_blk, sizeof(uint8_t))); for (i = 0; i < MAX_MB_PLANE; ++i) { for (k = 0; k < 3; ++k) { - CHECK_MEM_ERROR(cm, ctx->coeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->coeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->qcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->qcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->dqcoeff[i][k], vpx_memalign(32, num_pix * sizeof(*ctx->dqcoeff[i][k]))); - CHECK_MEM_ERROR(cm, ctx->eobs[i][k], + CHECK_MEM_ERROR(&cm->error, ctx->eobs[i][k], vpx_memalign(32, num_blk * sizeof(*ctx->eobs[i][k]))); ctx->coeff_pbuf[i][k] = ctx->coeff[i][k]; ctx->qcoeff_pbuf[i][k] = ctx->qcoeff[i][k]; @@ -100,10 +101,10 @@ void vp9_setup_pc_tree(VP9_COMMON *cm, ThreadData *td) { int nodes; vpx_free(td->leaf_tree); - CHECK_MEM_ERROR(cm, td->leaf_tree, + CHECK_MEM_ERROR(&cm->error, td->leaf_tree, vpx_calloc(leaf_nodes, sizeof(*td->leaf_tree))); vpx_free(td->pc_tree); - CHECK_MEM_ERROR(cm, td->pc_tree, + CHECK_MEM_ERROR(&cm->error, td->pc_tree, vpx_calloc(tree_nodes, sizeof(*td->pc_tree))); this_pc = &td->pc_tree[0]; diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index 77d72396a..e5dffa90a 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -319,7 +319,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation( filter_mbd->plane[2].dst.stride = denoiser->mc_running_avg_y[denoise_layer_idx].uv_stride; - set_ref_ptrs(cm, filter_mbd, saved_frame, NONE); + set_ref_ptrs(cm, filter_mbd, saved_frame, NO_REF_FRAME); vp9_build_inter_predictors_sby(filter_mbd, mi_row, mi_col, bs); // Restore everything to its original state @@ -387,7 +387,7 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, consec_zeromv = VPXMIN(cpi->consec_zero_mv[bl_index], consec_zeromv); // No need to keep checking 8x8 blocks if any of the sub-blocks // has small consec_zeromv (since threshold for no_skin based on - // zero/small motion in skin detection is high, i.e, > 4). + // zero/small motion in skin detection is high, i.e., > 4). if (consec_zeromv < 4) { i = ymis; break; @@ -634,11 +634,11 @@ int vp9_denoiser_alloc(VP9_COMMON *cm, struct SVC *svc, VP9_DENOISER *denoiser, denoiser->num_ref_frames = use_svc ? SVC_REF_FRAMES : NONSVC_REF_FRAMES; init_num_ref_frames = use_svc ? 
MAX_REF_FRAMES : NONSVC_REF_FRAMES; denoiser->num_layers = num_layers; - CHECK_MEM_ERROR(cm, denoiser->running_avg_y, + CHECK_MEM_ERROR(&cm->error, denoiser->running_avg_y, vpx_calloc(denoiser->num_ref_frames * num_layers, sizeof(denoiser->running_avg_y[0]))); CHECK_MEM_ERROR( - cm, denoiser->mc_running_avg_y, + &cm->error, denoiser->mc_running_avg_y, vpx_calloc(num_layers, sizeof(denoiser->mc_running_avg_y[0]))); for (layer = 0; layer < num_layers; ++layer) { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 1483ac069..7ff5f00ed 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -349,17 +349,17 @@ typedef struct { int32_t sum_error; int log2_count; int variance; -} var; +} Var; typedef struct { - var none; - var horz[2]; - var vert[2]; + Var none; + Var horz[2]; + Var vert[2]; } partition_variance; typedef struct { partition_variance part_variances; - var split[4]; + Var split[4]; } v4x4; typedef struct { @@ -384,7 +384,7 @@ typedef struct { typedef struct { partition_variance *part_variances; - var *split[4]; + Var *split[4]; } variance_node; typedef enum { @@ -436,13 +436,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { } // Set variance values given sum square error, sum error, count. -static void fill_variance(uint32_t s2, int32_t s, int c, var *v) { +static void fill_variance(uint32_t s2, int32_t s, int c, Var *v) { v->sum_square_error = s2; v->sum_error = s; v->log2_count = c; } -static void get_variance(var *v) { +static void get_variance(Var *v) { v->variance = (int)(256 * (v->sum_square_error - (uint32_t)(((int64_t)v->sum_error * v->sum_error) >> @@ -450,7 +450,7 @@ static void get_variance(var *v) { v->log2_count); } -static void sum_2_variances(const var *a, const var *b, var *r) { +static void sum_2_variances(const Var *a, const Var *b, Var *r) { assert(a->log2_count == b->log2_count); fill_variance(a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->log2_count + 1, r); @@ -1301,6 +1301,13 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, (frame_is_intra_only(cm) || (is_one_pass_svc(cpi) && cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)); + + if (!is_key_frame) { + if (cm->frame_refs[LAST_FRAME - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[LAST_FRAME - 1].sf.y_scale_fp == REF_INVALID_SCALE) + is_key_frame = 1; + } + // Always use 4x4 partition for key frame. const int use_4x4_partition = frame_is_intra_only(cm); const int low_res = (cm->width <= 352 && cm->height <= 288); @@ -1437,7 +1444,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; @@ -1545,7 +1552,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (low_res && threshold_4x4avg < INT64_MAX) - CHECK_MEM_ERROR(cm, vt2, vpx_calloc(16, sizeof(*vt2))); + CHECK_MEM_ERROR(&cm->error, vt2, vpx_calloc(16, sizeof(*vt2))); // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances // for splits. 
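An aside on the Var plumbing in the hunk above: get_variance() computes a population variance scaled by 256, and the sample count is carried as a power of two (log2_count) so that both divisions reduce to right shifts. A minimal standalone rendering of the same arithmetic, with a worked example (helper name hypothetical, not part of the patch):

    #include <assert.h>
    #include <stdint.h>

    /* 256 * (E[x^2] - E[x]^2) over n = 2^log2_count samples, using the
     * same shift-based form as get_variance(). */
    static int scaled_variance(uint32_t sum_square_error, int32_t sum_error,
                               int log2_count) {
      const uint32_t mean_sq =
          (uint32_t)(((int64_t)sum_error * sum_error) >> log2_count);
      return (int)((256 * (sum_square_error - mean_sq)) >> log2_count);
    }

    int main(void) {
      /* Four samples {1, 1, 3, 3}: sum = 8, sum of squares = 20.
       * True variance is 1, so the scaled result is 256. */
      assert(scaled_variance(20, 8, 2) == 256);
      return 0;
    }

This power-of-two bookkeeping is also why sum_2_variances() asserts that its two operands have equal log2_count: merging two equal-sized accumulators just adds the raw sums and increments the exponent.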
for (i = 0; i < 4; i++) { @@ -1706,7 +1713,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int y16_idx = ((j >> 1) << 1); // For inter frames: if variance4x4downsample[] == 1 for this 16x16 // block, then the variance is based on 4x4 down-sampling, so use vt2 - // in set_vt_partioning(), otherwise use vt. + // in set_vt_partitioning(), otherwise use vt. v16x16 *vtemp = (!is_key_frame && variance4x4downsample[i2 + j] == 1) ? &vt2[i2 + j] : &vt.split[i].split[j]; @@ -1863,8 +1870,8 @@ static void update_state(VP9_COMP *cpi, ThreadData *td, PICK_MODE_CONTEXT *ctx, vp9_update_mv_count(td); if (cm->interp_filter == SWITCHABLE) { - const int ctx = get_pred_context_switchable_interp(xd); - ++td->counts->switchable_interp[ctx][xdmi->interp_filter]; + const int ctx_interp = get_pred_context_switchable_interp(xd); + ++td->counts->switchable_interp[ctx_interp][xdmi->interp_filter]; } } @@ -1924,7 +1931,7 @@ static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode, mi->skip = 1; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; mi->interp_filter = filter_ref; @@ -1980,6 +1987,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, int64_t best_rd = INT64_MAX; vpx_clear_system_state(); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_sb_modes_time); +#endif // Use the lower precision, but faster, 32x32 fdct for mode selection. x->use_lp32x32fdct = 1; @@ -2018,20 +2028,20 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, // Save rdmult before it might be changed, so it can be restored later. orig_rdmult = x->rdmult; - if ((cpi->sf.tx_domain_thresh > 0.0) || (cpi->sf.quant_opt_thresh > 0.0)) { + if ((cpi->sf.tx_domain_thresh > 0.0) || + (cpi->sf.trellis_opt_tx_rd.thresh > 0.0)) { double logvar = vp9_log_block_var(cpi, x, bsize); - // Check block complexity as part of descision on using pixel or transform + // Check block complexity as part of decision on using pixel or transform // domain distortion in rd tests. x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion && (logvar >= cpi->sf.tx_domain_thresh); - // Check block complexity as part of descision on using quantized - // coefficient optimisation inside the rd loop. - x->block_qcoeff_opt = - cpi->sf.allow_quant_coeff_opt && (logvar <= cpi->sf.quant_opt_thresh); + // Store block complexity to decide on using quantized coefficient + // optimization inside the rd loop. 
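One note on the change just above: the precomputed trellis decision (block_qcoeff_opt, gated by quant_opt_thresh) is replaced by storing the block's log source variance, so the quantized-coefficient optimization choice can be made later at the point of use; the pixel-versus-transform-domain distortion choice is still taken here. That remaining gate reduces to a shape like this sketch (function name hypothetical; the threshold comes from the speed features):

    #include <stdbool.h>

    /* Complex (high-variance) blocks can tolerate the cheaper
     * transform-domain distortion measure in rd tests. */
    static bool use_tx_domain_distortion(bool allow_tx_domain, double logvar,
                                         double tx_domain_thresh) {
      return allow_tx_domain && logvar >= tx_domain_thresh;
    }
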
+ x->log_block_src_var = logvar; } else { x->block_tx_domain = cpi->sf.allow_txfm_domain_distortion; - x->block_qcoeff_opt = cpi->sf.allow_quant_coeff_opt; + x->log_block_src_var = 0.0; } set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); @@ -2047,15 +2057,27 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd); } else { if (bsize >= BLOCK_8X8) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP)) vp9_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize, ctx, best_rd); else vp9_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sb_time); +#endif } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif vp9_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col, rd_cost, bsize, ctx, best_rd); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rd_pick_inter_mode_sub8x8_time); +#endif } } @@ -2078,6 +2100,9 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, ctx->rate = rd_cost->rate; ctx->dist = rd_cost->dist; +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_sb_modes_time); +#endif } #endif // !CONFIG_REALTIME_ONLY @@ -2414,16 +2439,16 @@ static void set_source_var_based_partition(VP9_COMP *cpi, (row8x8_remaining >= MI_BLOCK_SIZE)) { int i, j; int index; - diff d32[4]; + Diff d32[4]; const int offset = (mi_row >> 1) * cm->mb_cols + (mi_col >> 1); int is_larger_better = 0; int use32x32 = 0; unsigned int thr = cpi->source_var_thresh; - memset(d32, 0, 4 * sizeof(diff)); + memset(d32, 0, sizeof(d32)); for (i = 0; i < 4; i++) { - diff *d16[4]; + Diff *d16[4]; for (j = 0; j < 4; j++) { int b_mi_row = coord_lookup[i * 4 + j].row; @@ -2730,10 +2755,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); pc_tree->horizontal[1].skip_ref_frame_mask = 0; rd_pick_sb_modes(cpi, tile_data, x, mi_row + (mi_step >> 1), mi_col, &tmp_rdc, subsize, &pc_tree->horizontal[1], INT_MAX, @@ -2754,10 +2779,10 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, if (last_part_rdc.rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) { RD_COST tmp_rdc; - PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0]; + PICK_MODE_CONTEXT *vctx = &pc_tree->vertical[0]; vp9_rd_cost_init(&tmp_rdc); - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + update_state(cpi, td, vctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, vctx); pc_tree->vertical[bsize > BLOCK_8X8].skip_ref_frame_mask = 0; rd_pick_sb_modes( cpi, tile_data, x, mi_row, mi_col + (mi_step >> 1), &tmp_rdc, @@ -2829,8 +2854,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, int x_idx = (i & 1) * 
(mi_step >> 1); int y_idx = (i >> 1) * (mi_step >> 1); RD_COST tmp_rdc; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl[8], sa[8]; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -3036,14 +3059,12 @@ static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, int mi_row, min_size = BLOCK_64X64; max_size = BLOCK_4X4; - if (prev_mi) { - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - mi = prev_mi[idy * cm->mi_stride + idx]; - bs = mi ? mi->sb_type : bsize; - min_size = VPXMIN(min_size, bs); - max_size = VPXMAX(max_size, bs); - } + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->sb_type : bsize; + min_size = VPXMIN(min_size, bs); + max_size = VPXMAX(max_size, bs); } } @@ -3189,7 +3210,7 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd, left_par = 1; } - if (prev_mi) { + if (prev_mi[0]) { context_size = prev_mi[0]->sb_type; if (context_size < bsize) last_par = 2; @@ -3422,18 +3443,23 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, MV ref_mv_full = { ref_mv.row >> 3, ref_mv.col >> 3 }; MV best_mv = { 0, 0 }; int cost_list[5]; + struct buf_2d backup_pre[MAX_MB_PLANE] = { { 0, 0 } }; - if (scaled_ref_frame) + if (scaled_ref_frame) { yv12 = scaled_ref_frame; - else + // As reported in b/311294795, the reference buffer pointer needs to be + // saved and restored after the search. Otherwise, it causes problems while + // the reference frame scaling happens. + for (int i = 0; i < MAX_MB_PLANE; i++) backup_pre[i] = xd->plane[i].pre[0]; + } else { yv12 = get_ref_frame_buffer(cpi, ref); + } assert(yv12 != NULL); if (!yv12) return; - vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[ref - 1].sf); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, NULL); mi->ref_frame[0] = ref; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = bsize; vp9_set_mv_search_range(&x->mv_limits, &ref_mv); vp9_full_pixel_search(cpi, x, bsize, &ref_mv_full, step_param, search_method, @@ -3444,6 +3470,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, x->mv_limits = tmp_mv_limits; mi->mv[0].as_mv = best_mv; + // Restore reference buffer pointer. + if (scaled_ref_frame) { + for (int i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_pre[i]; + } + set_ref_ptrs(cm, xd, mi->ref_frame[0], mi->ref_frame[1]); xd->plane[0].dst.buf = pred_buf; xd->plane[0].dst.stride = 64; @@ -3454,15 +3485,15 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x, // Features used: QP; spatial block size contexts; variance of prediction // residue after simple_motion_search. #define FEATURES 12 -static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, - MACROBLOCK *const x, - PC_TREE *const pc_tree, - BLOCK_SIZE bsize, int mi_row, - int mi_col, int *none, int *split) { +static void ml_predict_var_rd_partitioning(const VP9_COMP *const cpi, + MACROBLOCK *const x, + PC_TREE *const pc_tree, + BLOCK_SIZE bsize, int mi_row, + int mi_col, int *none, int *split) { const VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; + const MACROBLOCKD *const xd = &x->e_mbd; #if CONFIG_VP9_HIGHBITDEPTH - MACROBLOCKD *xd = &x->e_mbd; DECLARE_ALIGNED(16, uint8_t, pred_buffer[64 * 64 * 2]); uint8_t *const pred_buf = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
(CONVERT_TO_BYTEPTR(pred_buffer)) @@ -3545,7 +3576,6 @@ static void ml_predict_var_rd_paritioning(const VP9_COMP *const cpi, const unsigned int var = cpi->fn_ptr[bsize].vf(src, src_stride, pred, pred_stride, &sse); const float factor = (var == 0) ? 1.0f : (1.0f / (float)var); - const MACROBLOCKD *const xd = &x->e_mbd; const int has_above = !!xd->above_mi; const int has_left = !!xd->left_mi; const BLOCK_SIZE above_bsize = has_above ? xd->above_mi->sb_type : bsize; @@ -3695,7 +3725,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, int row, col; int dr = 0; - int count = 0; double r0, rk, beta; TplDepFrame *tpl_frame; @@ -3719,8 +3748,6 @@ static int get_rdmult_delta(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, intra_cost += this_stats->intra_cost; mc_dep_cost += this_stats->mc_dep_cost; - - ++count; } } @@ -3777,7 +3804,7 @@ static void assign_motion_vector_info(const int block_width_4x4, const int col_4x4 = col_start_4x4 + j; const int unit_index = row_4x4 * num_unit_cols + col_4x4; if (row_4x4 >= num_unit_rows || col_4x4 >= num_unit_cols) continue; - if (source_ref_frame[1] == NONE) { + if (source_ref_frame[1] == NO_REF_FRAME) { assert(source_mv[1]->row == 0 && source_mv[1]->col == 0); } motion_vector_info[unit_index].ref_frame[0] = source_ref_frame[0]; @@ -4080,8 +4107,8 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, mi_row + num_8x8_blocks_high_lookup[bsize] <= cm->mi_rows && mi_col + num_8x8_blocks_wide_lookup[bsize] <= cm->mi_cols; if (do_rd_ml_partition_var_pruning) { - ml_predict_var_rd_paritioning(cpi, x, pc_tree, bsize, mi_row, mi_col, - &partition_none_allowed, &do_split); + ml_predict_var_rd_partitioning(cpi, x, pc_tree, bsize, mi_row, mi_col, + &partition_none_allowed, &do_split); } else { vp9_zero(pc_tree->mv); } @@ -4330,9 +4357,9 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows && bsize > BLOCK_8X8) { - PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0]; - update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0); - encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx); + PICK_MODE_CONTEXT *hctx = &pc_tree->horizontal[0]; + update_state(cpi, td, hctx, mi_row, mi_col, subsize, 0); + encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, hctx); if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 && partition_none_allowed) pc_tree->horizontal[1].pred_interp_filter = pred_interp_filter; @@ -4407,12 +4434,31 @@ static int rd_pick_partition(VP9_COMP *cpi, ThreadData *td, restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize); } + if (bsize == BLOCK_64X64 && best_rdc.rdcost == INT64_MAX) { + vp9_rd_cost_reset(&this_rdc); + rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc, BLOCK_64X64, + ctx, INT_MAX, INT64_MAX); + ctx->rdcost = this_rdc.rdcost; + vp9_rd_cost_update(partition_mul, x->rddiv, &this_rdc); + if (this_rdc.rdcost < best_rdc.rdcost) { + best_rdc = this_rdc; + should_encode_sb = 1; + pc_tree->partitioning = PARTITION_NONE; + } + } + *rd_cost = best_rdc; if (should_encode_sb && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_sb_time); +#endif encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize, pc_tree); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_sb_time); +#endif #if CONFIG_RATE_CTRL if (oxcf->use_simple_encode_api) { // Store partition, motion vector of the superblock. 
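Between these hunks, a remark on the new BLOCK_64X64 fallback above: if every partition candidate was pruned or rejected, best_rdc.rdcost is still INT64_MAX and the superblock would be left with no coded mode, so one PARTITION_NONE evaluation is forced with unbounded thresholds (INT_MAX / INT64_MAX). The trigger reduces to a guard like this sketch (names simplified and hypothetical):

    #include <stdint.h>

    #define RD_INVALID INT64_MAX /* "no valid mode found" sentinel */

    static int need_whole_block_fallback(int is_64x64_root,
                                         int64_t best_rdcost) {
      return is_64x64_root && best_rdcost == RD_INVALID;
    }

This pairs with the earlier choose_partitioning change that bails to the key-frame path when a reference's scale factors are REF_INVALID_SCALE: once references can be dropped, all inter candidates may legitimately fail, yet the superblock must still be encoded.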
@@ -4539,8 +4585,15 @@ static void encode_rd_sb_row(VP9_COMP *cpi, ThreadData *td, &x->min_partition_size, &x->max_partition_size); } td->pc_root->none.rdcost = 0; + +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, rd_pick_partition_time); +#endif rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64, &dummy_rdc, dummy_rdc, td->pc_root); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, rd_pick_partition_time); +#endif } (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, sb_row, sb_col_in_tile, num_sb_cols); @@ -4672,6 +4725,8 @@ static void nonrd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data, set_segment_index(cpi, x, mi_row, mi_col, bsize, 0); + x->skip_recode = 0; + mi = xd->mi[0]; mi->sb_type = bsize; @@ -4795,9 +4850,9 @@ static void pred_pixel_ready_reset(PC_TREE *pc_tree, BLOCK_SIZE bsize) { #define FEATURES 6 #define LABELS 2 -static int ml_predict_var_paritioning(VP9_COMP *cpi, MACROBLOCK *x, - BLOCK_SIZE bsize, int mi_row, - int mi_col) { +static int ml_predict_var_partitioning(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, int mi_row, + int mi_col) { VP9_COMMON *const cm = &cpi->common; const NN_CONFIG *nn_config = NULL; @@ -4929,7 +4984,7 @@ static void nonrd_pick_partition(VP9_COMP *cpi, ThreadData *td, if (partition_none_allowed || do_split) do_rect = 0; if (partition_none_allowed && do_split) { const int ml_predicted_partition = - ml_predict_var_paritioning(cpi, x, bsize, mi_row, mi_col); + ml_predict_var_partitioning(cpi, x, bsize, mi_row, mi_col); if (ml_predicted_partition == PARTITION_NONE) do_split = 0; if (ml_predicted_partition == PARTITION_SPLIT) partition_none_allowed = 0; } @@ -5418,7 +5473,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile, &cm->frame_refs[LAST_FRAME - 1].sf); mi->ref_frame[0] = LAST_FRAME; } - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; mi->interp_filter = BILINEAR; @@ -5608,7 +5663,7 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, if ((cpi->oxcf.rc_mode == VPX_VBR && cpi->rc.high_source_sad && cpi->oxcf.speed < 6 && !frame_is_intra_only(cm) && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) { - // Use lower max_partition_size for low resoultions. + // Use lower max_partition_size for low resolutions. if (cm->width <= 352 && cm->height <= 288) x->max_partition_size = BLOCK_32X32; else @@ -5650,12 +5705,12 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } // end RTC play code -static INLINE uint32_t variance(const diff *const d) { +static INLINE uint32_t variance(const Diff *const d) { return d->sse - (uint32_t)(((int64_t)d->sum * d->sum) >> 8); } #if CONFIG_VP9_HIGHBITDEPTH -static INLINE uint32_t variance_highbd(diff *const d) { +static INLINE uint32_t variance_highbd(Diff *const d) { const int64_t var = (int64_t)d->sse - (((int64_t)d->sum * d->sum) >> 8); return (var >= 0) ? (uint32_t)var : 0; } @@ -5675,7 +5730,7 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { ? 
(cm->MBs * VAR_HIST_LARGE_CUT_OFF / 100) : (cm->MBs * VAR_HIST_SMALL_CUT_OFF / 100); DECLARE_ALIGNED(16, int, hist[VAR_HIST_BINS]); - diff *var16 = cpi->source_diff_var; + Diff *var16 = cpi->source_diff_var; int sum = 0; int i, j; @@ -5758,8 +5813,8 @@ static void source_var_based_partition_search_method(VP9_COMP *cpi) { if (cm->last_width != cm->width || cm->last_height != cm->height) { if (cpi->source_diff_var) vpx_free(cpi->source_diff_var); - CHECK_MEM_ERROR(cm, cpi->source_diff_var, - vpx_calloc(cm->MBs, sizeof(diff))); + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var))); } if (!cpi->frames_till_next_var_check) @@ -5798,7 +5853,7 @@ void vp9_init_tile_data(VP9_COMP *cpi) { if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) { if (cpi->tile_data != NULL) vpx_free(cpi->tile_data); CHECK_MEM_ERROR( - cm, cpi->tile_data, + &cm->error, cpi->tile_data, vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; @@ -5807,20 +5862,15 @@ TileDataEnc *tile_data = &cpi->tile_data[tile_row * tile_cols + tile_col]; int i, j; + const MV zero_mv = { 0, 0 }; for (i = 0; i < BLOCK_SIZES; ++i) { for (j = 0; j < MAX_MODES; ++j) { tile_data->thresh_freq_fact[i][j] = RD_THRESH_INIT_FACT; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE tile_data->thresh_freq_fact_prev[i][j] = RD_THRESH_INIT_FACT; -#endif // CONFIG_CONSISTENT_RECODE tile_data->mode_map[i][j] = j; } } + tile_data->firstpass_top_mv = zero_mv; #if CONFIG_MULTITHREAD tile_data->row_base_thresh_freq_fact = NULL; #endif @@ -6037,9 +6087,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { x->fwd_txfm4x4 = xd->lossless ? vp9_fwht4x4 : vpx_fdct4x4; #endif // CONFIG_VP9_HIGHBITDEPTH x->inv_txfm_add = xd->lossless ?
vp9_iwht4x4_add : vp9_idct4x4_add; -#if CONFIG_CONSISTENT_RECODE x->optimize = sf->optimize_coefficients == 1 && cpi->oxcf.pass != 1; -#endif if (xd->lossless) x->optimize = 0; x->sharpness = cpi->oxcf.sharpness; x->adjust_rdmult_by_segment = (cpi->oxcf.aq_mode == VARIANCE_AQ); @@ -6108,6 +6156,15 @@ static void encode_frame_internal(VP9_COMP *cpi) { cpi->rd.r0 = (double)intra_cost_base / mc_dep_cost_base; } + for (MV_REFERENCE_FRAME ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; + ++ref_frame) { + if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) { + if (cm->frame_refs[ref_frame - 1].sf.x_scale_fp == REF_INVALID_SCALE || + cm->frame_refs[ref_frame - 1].sf.y_scale_fp == REF_INVALID_SCALE) + cpi->ref_frame_flags &= ~ref_frame_to_flag(ref_frame); + } + } + // Frame segmentation if (cpi->oxcf.aq_mode == PERCEPTUAL_AQ) build_kmeans_segmentation(cpi); @@ -6166,7 +6223,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { int mi_row, mi_col; int sum_delta = 0; - int map_index = 0; int qdelta_index; int segment_id; @@ -6176,7 +6232,6 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { segment_id = mi_8x8[0]->segment_id; qdelta_index = get_segdata(seg, segment_id, SEG_LVL_ALT_Q); sum_delta += qdelta_index; - map_index++; } mi_8x8_ptr += cm->mi_stride; } @@ -6184,13 +6239,11 @@ static int compute_frame_aq_offset(struct VP9_COMP *cpi) { return sum_delta / (cm->mi_rows * cm->mi_cols); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void restore_encode_params(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -6201,35 +6254,19 @@ static void restore_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes[i][j] = rd_opt->filter_threshes_prev[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact[i][j] = - tile_data->thresh_freq_fact_prev[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = &cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact, tile_data->thresh_freq_fact_prev); } cm->interp_filter = cpi->sf.default_interp_filter; } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL void vp9_encode_frame(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - restore_encode_params(cpi); - } -#endif // CONFIG_RATE_CTRL -#if CONFIG_CONSISTENT_RECODE restore_encode_params(cpi); -#endif #if CONFIG_MISMATCH_DEBUG mismatch_reset_frame(MAX_MB_PLANE); @@ -6283,7 +6320,13 @@ void vp9_encode_frame(VP9_COMP *cpi) { if (cm->interp_filter == SWITCHABLE) cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_frame_internal_time); +#endif encode_frame_internal(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_frame_internal_time); +#endif for (i = 0; i < REFERENCE_MODES; ++i) mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2; diff --git 
a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index fa222f9dc..eded9f5c4 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -26,6 +26,7 @@ #include "vp9/common/vp9_scan.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_rd.h" #include "vp9/encoder/vp9_tokenize.h" @@ -78,7 +79,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, const int shift = (tx_size == TX_32X32); const int16_t *const dequant_ptr = pd->dequant; const uint8_t *const band_translate = get_band_translate(tx_size); - const scan_order *const so = get_scan(xd, tx_size, plane_type, block); + const ScanOrder *const so = get_scan(xd, tx_size, plane_type, block); const int16_t *const scan = so->scan; const int16_t *const nb = so->neighbors; const MODE_INFO *mbmi = xd->mi[0]; @@ -350,7 +351,7 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -366,28 +367,24 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vp9_highbd_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_highbd_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } return; @@ -397,26 +394,25 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp_32x32(coeff, 1024, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: 
vpx_fdct8x8(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff, - pd->dequant, eob, scan_order->scan, scan_order->iscan); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } } @@ -495,7 +491,7 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -511,28 +507,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } return; @@ -542,28 +534,24 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_16X16: vpx_fdct16x16(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_fdct8x8(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + 
vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } } @@ -759,10 +747,23 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, MODE_INFO *mi = xd->mi[0]; int plane; #if CONFIG_MISMATCH_DEBUG - struct encode_b_args arg = { x, 1, NULL, NULL, + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + NULL, // above entropy context + NULL, // left entropy context &mi->skip, mi_row, mi_col, output_enabled }; #else - struct encode_b_args arg = { x, 1, NULL, NULL, &mi->skip }; + struct encode_b_args arg = { x, + 1, // enable_trellis_opt + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + NULL, // above entropy context + NULL, // left entropy context + &mi->skip }; (void)mi_row; (void)mi_col; (void)output_enabled; @@ -780,9 +781,9 @@ void vp9_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, int mi_col, const TX_SIZE tx_size = plane ? get_uv_tx_size(mi, pd) : mi->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); - arg.enable_coeff_opt = 1; + arg.enable_trellis_opt = 1; } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } arg.ta = ctx.ta[plane]; arg.tl = ctx.tl[plane]; @@ -804,7 +805,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const scan_order *scan_order; + const ScanOrder *scan_order; TX_TYPE tx_type = DCT_DCT; PREDICTION_MODE mode; const int bwl = b_width_log2_lookup[plane_bsize]; @@ -814,17 +815,13 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, uint16_t *eob = &p->eobs[block]; const int src_stride = p->src.stride; const int dst_stride = pd->dst.stride; + int enable_trellis_opt = !x->skip_recode; ENTROPY_CONTEXT *a = NULL; ENTROPY_CONTEXT *l = NULL; int entropy_ctx = 0; dst = &pd->dst.buf[4 * (row * dst_stride + col)]; src = &p->src.buf[4 * (row * src_stride + col)]; src_diff = &p->src_diff[4 * (row * diff_stride + col)]; - if (args->enable_coeff_opt) { - a = &args->ta[col]; - l = &args->tl[row]; - entropy_ctx = combine_entropy_contexts(*a, *l); - } if (tx_size == TX_4X4) { tx_type = get_tx_type_4x4(get_plane_type(plane), xd, block); @@ -848,20 +845,42 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, // skip block condition should be handled before this is called. 
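The quantizer call sites rewritten through this file share one theme: rather than unpacking macroblock_plane (zbin/round/quant/quant_shift) and the scan order (scan/iscan) into six loose pointer arguments, vpx_quantize_b() and friends now receive the two structs directly, so call sites shrink and sibling pointers cannot be transposed. Reduced to a toy (all names hypothetical):

    #include <stdint.h>

    typedef struct {
      const int16_t *zbin, *round, *quant, *quant_shift;
    } PlaneParamsToy;
    typedef struct {
      const int16_t *scan, *iscan;
    } ScanOrderToy;

    /* Old shape: six loose pointers, easy to mis-order at a call site. */
    static void quantize_v1(const int16_t *zbin, const int16_t *round,
                            const int16_t *quant, const int16_t *quant_shift,
                            const int16_t *scan, const int16_t *iscan) {
      (void)zbin; (void)round; (void)quant; (void)quant_shift;
      (void)scan; (void)iscan;
    }

    /* New shape: the owning structs travel as units. */
    static void quantize_v2(const PlaneParamsToy *p, const ScanOrderToy *so) {
      quantize_v1(p->zbin, p->round, p->quant, p->quant_shift,
                  so->scan, so->iscan);
    }
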
assert(!x->skip_block); + if (!x->skip_recode) { + const int tx_size_in_pixels = (1 << tx_size) << 2; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride, + xd->bd); + } else { + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); + } +#else + vpx_subtract_block(tx_size_in_pixels, tx_size_in_pixels, src_diff, + diff_stride, src, src_stride, dst, dst_stride); +#endif + enable_trellis_opt = do_trellis_opt(pd, src_diff, diff_stride, row, col, + plane_bsize, tx_size, args); + } + + if (enable_trellis_opt) { + a = &args->ta[col]; + l = &args->tl[row]; + entropy_ctx = combine_entropy_contexts(*a, *l); + } + #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst); switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_highbd_quantize_b_32x32( - coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -870,17 +889,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct16x16(src_diff, coeff, diff_stride); else vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, + eob, scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -890,17 +906,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type == DCT_DCT) vpx_highbd_fdct8x8(src_diff, coeff, diff_stride); else vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -911,17 +924,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, default: assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, - src_stride, dst, dst_stride, xd->bd); if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, 
diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, scan_order->scan, scan_order->iscan); + vpx_highbd_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -945,14 +955,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, switch (tx_size) { case TX_32X32: if (!x->skip_recode) { - vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, - dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); - vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - scan_order->scan, scan_order->iscan); + vpx_quantize_b_32x32(coeff, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -960,14 +967,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_16X16: if (!x->skip_recode) { - vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -975,14 +979,11 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, break; case TX_8X8: if (!x->skip_recode) { - vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, - dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); - vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) @@ -991,17 +992,14 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, default: assert(tx_size == TX_4X4); if (!x->skip_recode) { - vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, - dst_stride); if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); else x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vpx_quantize_b(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); } - if (args->enable_coeff_opt && !x->skip_recode) { + if (enable_trellis_opt) { *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0; } if (!x->skip_encode && *eob) { @@ -1019,28 +1017,43 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int 
enable_optimize_b) { + int enable_trellis_opt) { const MACROBLOCKD *const xd = &x->e_mbd; struct optimize_ctx ctx; #if CONFIG_MISMATCH_DEBUG // TODO(angiebird): make mismatch_debug support intra mode struct encode_b_args arg = { - x, enable_optimize_b, ctx.ta[plane], ctx.tl[plane], &xd->mi[0]->skip, 0, 0, - 0 + x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled }; #else - struct encode_b_args arg = { x, enable_optimize_b, ctx.ta[plane], - ctx.tl[plane], &xd->mi[0]->skip }; + struct encode_b_args arg = { x, + enable_trellis_opt, + 0.0, // trellis_opt_thresh + NULL, // &sse_calc_done + NULL, // &sse + ctx.ta[plane], + ctx.tl[plane], + &xd->mi[0]->skip }; #endif - if (enable_optimize_b && x->optimize && + if (enable_trellis_opt && x->optimize && (!x->skip_recode || !x->skip_optimize)) { const struct macroblockd_plane *const pd = &xd->plane[plane]; const TX_SIZE tx_size = plane ? get_uv_tx_size(xd->mi[0], pd) : xd->mi[0]->tx_size; vp9_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]); } else { - arg.enable_coeff_opt = 0; + arg.enable_trellis_opt = 0; } vp9_foreach_transformed_block_in_plane(xd, bsize, plane, diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 1975ee73a..1391446be 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -20,7 +20,10 @@ extern "C" { struct encode_b_args { MACROBLOCK *x; - int enable_coeff_opt; + int enable_trellis_opt; + double trellis_opt_thresh; + int *sse_calc_done; + int64_t *sse; ENTROPY_CONTEXT *ta; ENTROPY_CONTEXT *tl; int8_t *skip; @@ -48,7 +51,7 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size, void *arg); void vp9_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane, - int enable_optimize_b); + int enable_trellis_opt); #ifdef __cplusplus } // extern "C" diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e38507754..152d42bc9 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -12,6 +12,7 @@ #include <math.h> #include <stdio.h> #include <stdlib.h> +#include <string.h> #include "./vp9_rtcd.h" #include "./vpx_config.h" @@ -23,6 +24,7 @@ #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif +#include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/system_state.h" #include "vpx_ports/vpx_once.h" @@ -32,18 +34,15 @@ #endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "vp9/common/vp9_alloccommon.h" +#include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_idct.h" -#if CONFIG_NON_GREEDY_MV -#include "vp9/common/vp9_mvref_common.h" -#endif #if CONFIG_VP9_POSTPROC #include "vp9/common/vp9_postproc.h" #endif #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" #include "vp9/common/vp9_tile_common.h" -#include "vp9/common/vp9_scan.h" #if !CONFIG_REALTIME_ONLY #include "vp9/encoder/vp9_alt_ref_aq.h" @@ -81,8 +80,11 @@ #include "vp9/encoder/vp9_speed_features.h" #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_temporal_filter.h" +#include "vp9/encoder/vp9_tpl_model.h" #include "vp9/vp9_cx_iface.h" +#include "vpx/vpx_ext_ratectrl.h" + #define AM_SEGMENT_ID_INACTIVE 7 #define AM_SEGMENT_ID_ACTIVE 0 @@ -126,13 +128,6 @@ static int is_spatial_denoise_enabled(VP9_COMP *cpi) { } #endif -#if CONFIG_VP9_HIGHBITDEPTH -void 
highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); -#endif -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size); - #if !CONFIG_REALTIME_ONLY // compute adaptive threshold for skip recoding static int compute_context_model_thresh(const VP9_COMP *const cpi) { @@ -148,7 +143,7 @@ static int compute_context_model_thresh(const VP9_COMP *const cpi) { // frame context probability model is less than a certain threshold. // The first component is the most critical part to guarantee adaptivity. // Other parameters are estimated based on normal setting of hd resolution - // parameters. e.g frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 + // parameters. e.g. frame_size = 1920x1080, bitrate = 8000, qindex_factor < 50 const int thresh = ((FRAME_SIZE_FACTOR * frame_size - FRAME_RATE_FACTOR * bitrate) * qindex_factor) >> @@ -502,22 +497,22 @@ static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = { "Too many reference buffers are used." }; -static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { +static INLINE void Scale2Ratio(VPX_SCALING_MODE mode, int *hr, int *hs) { switch (mode) { - case NORMAL: + case VP8E_NORMAL: *hr = 1; *hs = 1; break; - case FOURFIVE: + case VP8E_FOURFIVE: *hr = 4; *hs = 5; break; - case THREEFIVE: + case VP8E_THREEFIVE: *hr = 3; *hs = 5; break; default: - assert(mode == ONETWO); + assert(mode == VP8E_ONETWO); *hr = 1; *hs = 2; break; @@ -690,9 +685,10 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level; } -int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[8], int delta_lf[8], - int skip[8], int ref_frame[8]) { +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]) { VP9_COMMON *cm = &cpi->common; vpx_roi_map_t *roi = &cpi->roi; const int range = 63; @@ -703,13 +699,13 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, // Check number of rows and columns match if (frame_rows != (int)rows || frame_cols != (int)cols) { - return -1; + return VPX_CODEC_INVALID_PARAM; } if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) || !check_seg_range(ref_frame, ref_frame_range) || !check_seg_range(skip, skip_range)) - return -1; + return VPX_CODEC_INVALID_PARAM; // Also disable segmentation if no deltas are specified. if (!map || @@ -723,14 +719,15 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, ref_frame[6] == -1 && ref_frame[7] == -1))) { vp9_disable_segmentation(&cm->seg); cpi->roi.enabled = 0; - return 0; + return VPX_CODEC_OK; } if (roi->roi_map) { vpx_free(roi->roi_map); roi->roi_map = NULL; } - CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols)); + roi->roi_map = vpx_malloc(rows * cols); + if (!roi->roi_map) return VPX_CODEC_MEM_ERROR; // Copy to ROI structure in the compressor. 
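One observation on the vp9_set_roi_map() conversion above: failures now surface as vpx_codec_err_t values instead of a bare -1, and the map allocation skips CHECK_MEM_ERROR (which reports through the shared error context), presumably so an out-of-memory condition is returned to the caller as VPX_CODEC_MEM_ERROR rather than raised internally. A sketch of the matching caller-side handling, built only on the public error codes (wrapper name hypothetical):

    #include "vpx/vpx_codec.h" /* vpx_codec_err_t, VPX_CODEC_* values */

    static const char *roi_status(vpx_codec_err_t err) {
      switch (err) {
        case VPX_CODEC_OK: return "ROI map applied";
        case VPX_CODEC_INVALID_PARAM:
          return "ROI map rejected: dimensions or deltas out of range";
        case VPX_CODEC_MEM_ERROR:
          return "ROI map rejected: allocation failed";
        default: return "ROI map rejected: unexpected error";
      }
    }
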
memcpy(roi->roi_map, map, rows * cols); @@ -742,7 +739,7 @@ int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, roi->rows = rows; roi->cols = cols; - return 0; + return VPX_CODEC_OK; } int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, @@ -886,10 +883,11 @@ static int vp9_enc_alloc_mi(VP9_COMMON *cm, int mi_size) { if (!cm->prev_mip) return 1; cm->mi_alloc_size = mi_size; - cm->mi_grid_base = (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + cm->mi_grid_base = + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->mi_grid_base)); if (!cm->mi_grid_base) return 1; cm->prev_mi_grid_base = - (MODE_INFO **)vpx_calloc(mi_size, sizeof(MODE_INFO *)); + (MODE_INFO **)vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base)); if (!cm->prev_mi_grid_base) return 1; return 0; @@ -1383,7 +1381,7 @@ static void alloc_context_buffers_ext(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; int mi_size = cm->mi_cols * cm->mi_rows; - CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base, + CHECK_MEM_ERROR(&cm->error, cpi->mbmi_ext_base, vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base))); } @@ -1402,14 +1400,14 @@ static void alloc_compressor_data(VP9_COMP *cpi) { { unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols); - CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0], + CHECK_MEM_ERROR(&cm->error, cpi->tile_tok[0][0], vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0]))); } sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; vpx_free(cpi->tplist[0][0]); CHECK_MEM_ERROR( - cm, cpi->tplist[0][0], + &cm->error, cpi->tplist[0][0], vpx_calloc(sb_rows * 4 * (1 << 6), sizeof(*cpi->tplist[0][0]))); vp9_setup_pc_tree(&cpi->common, &cpi->td); @@ -1571,13 +1569,15 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } #if CONFIG_VP9_HIGHBITDEPTH -#define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; +#define HIGHBD_BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; #define MAKE_BFP_SAD_WRAPPER(fnname) \ static unsigned int fnname##_bits8(const uint8_t *src_ptr, \ @@ -1637,284 +1637,361 @@ void vp9_set_rc_buffer_sizes(VP9_COMP *cpi) { } MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x64x4d) + 
MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_32x32) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_32x32x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_64x64) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_64x64x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_16x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_16x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x16) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x16x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_8x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_8x4x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x8) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x8x4d) + MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4) +MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad_skip_4x4) MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg) MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d) +MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad_skip_4x4x4d) static void highbd_set_var_fns(VP9_COMP *const cpi) { VP9_COMMON *const cm = &cpi->common; if (cm->use_highbitdepth) { switch (cm->bit_depth) { case VPX_BITS_8: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits8, - vpx_highbd_sad32x16_avg_bits8, vpx_highbd_8_variance32x16, - vpx_highbd_8_sub_pixel_variance32x16, - vpx_highbd_8_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits8, - vpx_highbd_sad16x32_avg_bits8, vpx_highbd_8_variance16x32, - vpx_highbd_8_sub_pixel_variance16x32, - vpx_highbd_8_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits8, - vpx_highbd_sad64x32_avg_bits8, vpx_highbd_8_variance64x32, - vpx_highbd_8_sub_pixel_variance64x32, - vpx_highbd_8_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits8) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits8, - vpx_highbd_sad32x64_avg_bits8, vpx_highbd_8_variance32x64, - vpx_highbd_8_sub_pixel_variance32x64, - vpx_highbd_8_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits8) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits8, - vpx_highbd_sad32x32_avg_bits8, vpx_highbd_8_variance32x32, - vpx_highbd_8_sub_pixel_variance32x32, - vpx_highbd_8_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits8) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits8, - vpx_highbd_sad64x64_avg_bits8, 
vpx_highbd_8_variance64x64, - vpx_highbd_8_sub_pixel_variance64x64, - vpx_highbd_8_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits8) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits8, - vpx_highbd_sad16x16_avg_bits8, vpx_highbd_8_variance16x16, - vpx_highbd_8_sub_pixel_variance16x16, - vpx_highbd_8_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits8) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits8, - vpx_highbd_sad16x8_avg_bits8, vpx_highbd_8_variance16x8, - vpx_highbd_8_sub_pixel_variance16x8, - vpx_highbd_8_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits8) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits8, - vpx_highbd_sad8x16_avg_bits8, vpx_highbd_8_variance8x16, - vpx_highbd_8_sub_pixel_variance8x16, - vpx_highbd_8_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits8) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits8, + vpx_highbd_sad_skip_32x16_bits8, vpx_highbd_sad32x16_avg_bits8, + vpx_highbd_8_variance32x16, vpx_highbd_8_sub_pixel_variance32x16, + vpx_highbd_8_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits8, vpx_highbd_sad_skip_32x16x4d_bits8) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits8, + vpx_highbd_sad_skip_16x32_bits8, vpx_highbd_sad16x32_avg_bits8, + vpx_highbd_8_variance16x32, vpx_highbd_8_sub_pixel_variance16x32, + vpx_highbd_8_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits8, vpx_highbd_sad_skip_16x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits8, + vpx_highbd_sad_skip_64x32_bits8, vpx_highbd_sad64x32_avg_bits8, + vpx_highbd_8_variance64x32, vpx_highbd_8_sub_pixel_variance64x32, + vpx_highbd_8_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits8, vpx_highbd_sad_skip_64x32x4d_bits8) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits8, + vpx_highbd_sad_skip_32x64_bits8, vpx_highbd_sad32x64_avg_bits8, + vpx_highbd_8_variance32x64, vpx_highbd_8_sub_pixel_variance32x64, + vpx_highbd_8_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits8, vpx_highbd_sad_skip_32x64x4d_bits8) HIGHBD_BFP( - BLOCK_8X8, vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8, - vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, - vpx_highbd_8_sub_pixel_avg_variance8x8, vpx_highbd_sad8x8x4d_bits8) + BLOCK_32X32, vpx_highbd_sad32x32_bits8, + vpx_highbd_sad_skip_32x32_bits8, vpx_highbd_sad32x32_avg_bits8, + vpx_highbd_8_variance32x32, vpx_highbd_8_sub_pixel_variance32x32, + vpx_highbd_8_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits8, vpx_highbd_sad_skip_32x32x4d_bits8) HIGHBD_BFP( - BLOCK_8X4, vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8, - vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, - vpx_highbd_8_sub_pixel_avg_variance8x4, vpx_highbd_sad8x4x4d_bits8) + BLOCK_64X64, vpx_highbd_sad64x64_bits8, + vpx_highbd_sad_skip_64x64_bits8, vpx_highbd_sad64x64_avg_bits8, + vpx_highbd_8_variance64x64, vpx_highbd_8_sub_pixel_variance64x64, + vpx_highbd_8_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits8, vpx_highbd_sad_skip_64x64x4d_bits8) HIGHBD_BFP( - BLOCK_4X8, vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8, - vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, - vpx_highbd_8_sub_pixel_avg_variance4x8, vpx_highbd_sad4x8x4d_bits8) + BLOCK_16X16, vpx_highbd_sad16x16_bits8, + vpx_highbd_sad_skip_16x16_bits8, vpx_highbd_sad16x16_avg_bits8, + vpx_highbd_8_variance16x16, vpx_highbd_8_sub_pixel_variance16x16, + vpx_highbd_8_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits8, vpx_highbd_sad_skip_16x16x4d_bits8) HIGHBD_BFP( 
- BLOCK_4X4, vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8, - vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, - vpx_highbd_8_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits8) + BLOCK_16X8, vpx_highbd_sad16x8_bits8, + vpx_highbd_sad_skip_16x8_bits8, vpx_highbd_sad16x8_avg_bits8, + vpx_highbd_8_variance16x8, vpx_highbd_8_sub_pixel_variance16x8, + vpx_highbd_8_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits8, vpx_highbd_sad_skip_16x8x4d_bits8) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits8, + vpx_highbd_sad_skip_8x16_bits8, vpx_highbd_sad8x16_avg_bits8, + vpx_highbd_8_variance8x16, vpx_highbd_8_sub_pixel_variance8x16, + vpx_highbd_8_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits8, vpx_highbd_sad_skip_8x16x4d_bits8) + + HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits8, + vpx_highbd_sad_skip_8x8_bits8, vpx_highbd_sad8x8_avg_bits8, + vpx_highbd_8_variance8x8, vpx_highbd_8_sub_pixel_variance8x8, + vpx_highbd_8_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits8, vpx_highbd_sad_skip_8x8x4d_bits8) + + HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits8, + vpx_highbd_sad_skip_8x4_bits8, vpx_highbd_sad8x4_avg_bits8, + vpx_highbd_8_variance8x4, vpx_highbd_8_sub_pixel_variance8x4, + vpx_highbd_8_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits8, vpx_highbd_sad_skip_8x4x4d_bits8) + + HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits8, + vpx_highbd_sad_skip_4x8_bits8, vpx_highbd_sad4x8_avg_bits8, + vpx_highbd_8_variance4x8, vpx_highbd_8_sub_pixel_variance4x8, + vpx_highbd_8_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits8, vpx_highbd_sad_skip_4x8x4d_bits8) + + HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits8, + vpx_highbd_sad_skip_4x4_bits8, vpx_highbd_sad4x4_avg_bits8, + vpx_highbd_8_variance4x4, vpx_highbd_8_sub_pixel_variance4x4, + vpx_highbd_8_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits8, vpx_highbd_sad_skip_4x4x4d_bits8) break; case VPX_BITS_10: - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits10, - vpx_highbd_sad32x16_avg_bits10, vpx_highbd_10_variance32x16, - vpx_highbd_10_sub_pixel_variance32x16, - vpx_highbd_10_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits10, - vpx_highbd_sad16x32_avg_bits10, vpx_highbd_10_variance16x32, - vpx_highbd_10_sub_pixel_variance16x32, - vpx_highbd_10_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits10, - vpx_highbd_sad64x32_avg_bits10, vpx_highbd_10_variance64x32, - vpx_highbd_10_sub_pixel_variance64x32, - vpx_highbd_10_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits10) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits10, - vpx_highbd_sad32x64_avg_bits10, vpx_highbd_10_variance32x64, - vpx_highbd_10_sub_pixel_variance32x64, - vpx_highbd_10_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits10) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits10, - vpx_highbd_sad32x32_avg_bits10, vpx_highbd_10_variance32x32, - vpx_highbd_10_sub_pixel_variance32x32, - vpx_highbd_10_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits10) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits10, - vpx_highbd_sad64x64_avg_bits10, vpx_highbd_10_variance64x64, - vpx_highbd_10_sub_pixel_variance64x64, - vpx_highbd_10_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits10) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits10, - vpx_highbd_sad16x16_avg_bits10, vpx_highbd_10_variance16x16, - vpx_highbd_10_sub_pixel_variance16x16, - 
vpx_highbd_10_sub_pixel_avg_variance16x16, - vpx_highbd_sad16x16x4d_bits10) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits10, - vpx_highbd_sad16x8_avg_bits10, vpx_highbd_10_variance16x8, - vpx_highbd_10_sub_pixel_variance16x8, - vpx_highbd_10_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits10, - vpx_highbd_sad8x16_avg_bits10, vpx_highbd_10_variance8x16, - vpx_highbd_10_sub_pixel_variance8x16, - vpx_highbd_10_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits10) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits10, - vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, - vpx_highbd_10_sub_pixel_variance8x8, - vpx_highbd_10_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits10) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits10, - vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, - vpx_highbd_10_sub_pixel_variance8x4, - vpx_highbd_10_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits10) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits10, - vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, - vpx_highbd_10_sub_pixel_variance4x8, - vpx_highbd_10_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits10) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits10, - vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, - vpx_highbd_10_sub_pixel_variance4x4, - vpx_highbd_10_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits10) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits10, + vpx_highbd_sad_skip_32x16_bits10, vpx_highbd_sad32x16_avg_bits10, + vpx_highbd_10_variance32x16, vpx_highbd_10_sub_pixel_variance32x16, + vpx_highbd_10_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits10, vpx_highbd_sad_skip_32x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits10, + vpx_highbd_sad_skip_16x32_bits10, vpx_highbd_sad16x32_avg_bits10, + vpx_highbd_10_variance16x32, vpx_highbd_10_sub_pixel_variance16x32, + vpx_highbd_10_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits10, vpx_highbd_sad_skip_16x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits10, + vpx_highbd_sad_skip_64x32_bits10, vpx_highbd_sad64x32_avg_bits10, + vpx_highbd_10_variance64x32, vpx_highbd_10_sub_pixel_variance64x32, + vpx_highbd_10_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits10, vpx_highbd_sad_skip_64x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits10, + vpx_highbd_sad_skip_32x64_bits10, vpx_highbd_sad32x64_avg_bits10, + vpx_highbd_10_variance32x64, vpx_highbd_10_sub_pixel_variance32x64, + vpx_highbd_10_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits10, vpx_highbd_sad_skip_32x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits10, + vpx_highbd_sad_skip_32x32_bits10, vpx_highbd_sad32x32_avg_bits10, + vpx_highbd_10_variance32x32, vpx_highbd_10_sub_pixel_variance32x32, + vpx_highbd_10_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits10, vpx_highbd_sad_skip_32x32x4d_bits10) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits10, + vpx_highbd_sad_skip_64x64_bits10, vpx_highbd_sad64x64_avg_bits10, + vpx_highbd_10_variance64x64, vpx_highbd_10_sub_pixel_variance64x64, + vpx_highbd_10_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits10, vpx_highbd_sad_skip_64x64x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits10, + vpx_highbd_sad_skip_16x16_bits10, vpx_highbd_sad16x16_avg_bits10, + vpx_highbd_10_variance16x16, vpx_highbd_10_sub_pixel_variance16x16, + vpx_highbd_10_sub_pixel_avg_variance16x16, + 
vpx_highbd_sad16x16x4d_bits10, vpx_highbd_sad_skip_16x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits10, + vpx_highbd_sad_skip_16x8_bits10, vpx_highbd_sad16x8_avg_bits10, + vpx_highbd_10_variance16x8, vpx_highbd_10_sub_pixel_variance16x8, + vpx_highbd_10_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits10, vpx_highbd_sad_skip_16x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits10, + vpx_highbd_sad_skip_8x16_bits10, vpx_highbd_sad8x16_avg_bits10, + vpx_highbd_10_variance8x16, vpx_highbd_10_sub_pixel_variance8x16, + vpx_highbd_10_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits10, vpx_highbd_sad_skip_8x16x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits10, vpx_highbd_sad_skip_8x8_bits10, + vpx_highbd_sad8x8_avg_bits10, vpx_highbd_10_variance8x8, + vpx_highbd_10_sub_pixel_variance8x8, + vpx_highbd_10_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits10, vpx_highbd_sad_skip_8x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits10, vpx_highbd_sad_skip_8x4_bits10, + vpx_highbd_sad8x4_avg_bits10, vpx_highbd_10_variance8x4, + vpx_highbd_10_sub_pixel_variance8x4, + vpx_highbd_10_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits10, vpx_highbd_sad_skip_8x4x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits10, vpx_highbd_sad_skip_4x8_bits10, + vpx_highbd_sad4x8_avg_bits10, vpx_highbd_10_variance4x8, + vpx_highbd_10_sub_pixel_variance4x8, + vpx_highbd_10_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits10, vpx_highbd_sad_skip_4x8x4d_bits10) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits10, vpx_highbd_sad_skip_4x4_bits10, + vpx_highbd_sad4x4_avg_bits10, vpx_highbd_10_variance4x4, + vpx_highbd_10_sub_pixel_variance4x4, + vpx_highbd_10_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits10, vpx_highbd_sad_skip_4x4x4d_bits10) break; default: assert(cm->bit_depth == VPX_BITS_12); - HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, - vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, - vpx_highbd_12_sub_pixel_variance32x16, - vpx_highbd_12_sub_pixel_avg_variance32x16, - vpx_highbd_sad32x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X32, vpx_highbd_sad16x32_bits12, - vpx_highbd_sad16x32_avg_bits12, vpx_highbd_12_variance16x32, - vpx_highbd_12_sub_pixel_variance16x32, - vpx_highbd_12_sub_pixel_avg_variance16x32, - vpx_highbd_sad16x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X32, vpx_highbd_sad64x32_bits12, - vpx_highbd_sad64x32_avg_bits12, vpx_highbd_12_variance64x32, - vpx_highbd_12_sub_pixel_variance64x32, - vpx_highbd_12_sub_pixel_avg_variance64x32, - vpx_highbd_sad64x32x4d_bits12) - - HIGHBD_BFP(BLOCK_32X64, vpx_highbd_sad32x64_bits12, - vpx_highbd_sad32x64_avg_bits12, vpx_highbd_12_variance32x64, - vpx_highbd_12_sub_pixel_variance32x64, - vpx_highbd_12_sub_pixel_avg_variance32x64, - vpx_highbd_sad32x64x4d_bits12) - - HIGHBD_BFP(BLOCK_32X32, vpx_highbd_sad32x32_bits12, - vpx_highbd_sad32x32_avg_bits12, vpx_highbd_12_variance32x32, - vpx_highbd_12_sub_pixel_variance32x32, - vpx_highbd_12_sub_pixel_avg_variance32x32, - vpx_highbd_sad32x32x4d_bits12) - - HIGHBD_BFP(BLOCK_64X64, vpx_highbd_sad64x64_bits12, - vpx_highbd_sad64x64_avg_bits12, vpx_highbd_12_variance64x64, - vpx_highbd_12_sub_pixel_variance64x64, - vpx_highbd_12_sub_pixel_avg_variance64x64, - vpx_highbd_sad64x64x4d_bits12) - - HIGHBD_BFP(BLOCK_16X16, vpx_highbd_sad16x16_bits12, - vpx_highbd_sad16x16_avg_bits12, vpx_highbd_12_variance16x16, - vpx_highbd_12_sub_pixel_variance16x16, - vpx_highbd_12_sub_pixel_avg_variance16x16, - 
vpx_highbd_sad16x16x4d_bits12) - - HIGHBD_BFP(BLOCK_16X8, vpx_highbd_sad16x8_bits12, - vpx_highbd_sad16x8_avg_bits12, vpx_highbd_12_variance16x8, - vpx_highbd_12_sub_pixel_variance16x8, - vpx_highbd_12_sub_pixel_avg_variance16x8, - vpx_highbd_sad16x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X16, vpx_highbd_sad8x16_bits12, - vpx_highbd_sad8x16_avg_bits12, vpx_highbd_12_variance8x16, - vpx_highbd_12_sub_pixel_variance8x16, - vpx_highbd_12_sub_pixel_avg_variance8x16, - vpx_highbd_sad8x16x4d_bits12) - - HIGHBD_BFP(BLOCK_8X8, vpx_highbd_sad8x8_bits12, - vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, - vpx_highbd_12_sub_pixel_variance8x8, - vpx_highbd_12_sub_pixel_avg_variance8x8, - vpx_highbd_sad8x8x4d_bits12) - - HIGHBD_BFP(BLOCK_8X4, vpx_highbd_sad8x4_bits12, - vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, - vpx_highbd_12_sub_pixel_variance8x4, - vpx_highbd_12_sub_pixel_avg_variance8x4, - vpx_highbd_sad8x4x4d_bits12) - - HIGHBD_BFP(BLOCK_4X8, vpx_highbd_sad4x8_bits12, - vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, - vpx_highbd_12_sub_pixel_variance4x8, - vpx_highbd_12_sub_pixel_avg_variance4x8, - vpx_highbd_sad4x8x4d_bits12) - - HIGHBD_BFP(BLOCK_4X4, vpx_highbd_sad4x4_bits12, - vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, - vpx_highbd_12_sub_pixel_variance4x4, - vpx_highbd_12_sub_pixel_avg_variance4x4, - vpx_highbd_sad4x4x4d_bits12) + HIGHBD_BFP( + BLOCK_32X16, vpx_highbd_sad32x16_bits12, + vpx_highbd_sad_skip_32x16_bits12, vpx_highbd_sad32x16_avg_bits12, + vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, + vpx_highbd_12_sub_pixel_avg_variance32x16, + vpx_highbd_sad32x16x4d_bits12, vpx_highbd_sad_skip_32x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X32, vpx_highbd_sad16x32_bits12, + vpx_highbd_sad_skip_16x32_bits12, vpx_highbd_sad16x32_avg_bits12, + vpx_highbd_12_variance16x32, vpx_highbd_12_sub_pixel_variance16x32, + vpx_highbd_12_sub_pixel_avg_variance16x32, + vpx_highbd_sad16x32x4d_bits12, vpx_highbd_sad_skip_16x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X32, vpx_highbd_sad64x32_bits12, + vpx_highbd_sad_skip_64x32_bits12, vpx_highbd_sad64x32_avg_bits12, + vpx_highbd_12_variance64x32, vpx_highbd_12_sub_pixel_variance64x32, + vpx_highbd_12_sub_pixel_avg_variance64x32, + vpx_highbd_sad64x32x4d_bits12, vpx_highbd_sad_skip_64x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X64, vpx_highbd_sad32x64_bits12, + vpx_highbd_sad_skip_32x64_bits12, vpx_highbd_sad32x64_avg_bits12, + vpx_highbd_12_variance32x64, vpx_highbd_12_sub_pixel_variance32x64, + vpx_highbd_12_sub_pixel_avg_variance32x64, + vpx_highbd_sad32x64x4d_bits12, vpx_highbd_sad_skip_32x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_32X32, vpx_highbd_sad32x32_bits12, + vpx_highbd_sad_skip_32x32_bits12, vpx_highbd_sad32x32_avg_bits12, + vpx_highbd_12_variance32x32, vpx_highbd_12_sub_pixel_variance32x32, + vpx_highbd_12_sub_pixel_avg_variance32x32, + vpx_highbd_sad32x32x4d_bits12, vpx_highbd_sad_skip_32x32x4d_bits12) + + HIGHBD_BFP( + BLOCK_64X64, vpx_highbd_sad64x64_bits12, + vpx_highbd_sad_skip_64x64_bits12, vpx_highbd_sad64x64_avg_bits12, + vpx_highbd_12_variance64x64, vpx_highbd_12_sub_pixel_variance64x64, + vpx_highbd_12_sub_pixel_avg_variance64x64, + vpx_highbd_sad64x64x4d_bits12, vpx_highbd_sad_skip_64x64x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X16, vpx_highbd_sad16x16_bits12, + vpx_highbd_sad_skip_16x16_bits12, vpx_highbd_sad16x16_avg_bits12, + vpx_highbd_12_variance16x16, vpx_highbd_12_sub_pixel_variance16x16, + vpx_highbd_12_sub_pixel_avg_variance16x16, + vpx_highbd_sad16x16x4d_bits12, 
vpx_highbd_sad_skip_16x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_16X8, vpx_highbd_sad16x8_bits12, + vpx_highbd_sad_skip_16x8_bits12, vpx_highbd_sad16x8_avg_bits12, + vpx_highbd_12_variance16x8, vpx_highbd_12_sub_pixel_variance16x8, + vpx_highbd_12_sub_pixel_avg_variance16x8, + vpx_highbd_sad16x8x4d_bits12, vpx_highbd_sad_skip_16x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X16, vpx_highbd_sad8x16_bits12, + vpx_highbd_sad_skip_8x16_bits12, vpx_highbd_sad8x16_avg_bits12, + vpx_highbd_12_variance8x16, vpx_highbd_12_sub_pixel_variance8x16, + vpx_highbd_12_sub_pixel_avg_variance8x16, + vpx_highbd_sad8x16x4d_bits12, vpx_highbd_sad_skip_8x16x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X8, vpx_highbd_sad8x8_bits12, vpx_highbd_sad_skip_8x8_bits12, + vpx_highbd_sad8x8_avg_bits12, vpx_highbd_12_variance8x8, + vpx_highbd_12_sub_pixel_variance8x8, + vpx_highbd_12_sub_pixel_avg_variance8x8, + vpx_highbd_sad8x8x4d_bits12, vpx_highbd_sad_skip_8x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_8X4, vpx_highbd_sad8x4_bits12, vpx_highbd_sad_skip_8x4_bits12, + vpx_highbd_sad8x4_avg_bits12, vpx_highbd_12_variance8x4, + vpx_highbd_12_sub_pixel_variance8x4, + vpx_highbd_12_sub_pixel_avg_variance8x4, + vpx_highbd_sad8x4x4d_bits12, vpx_highbd_sad_skip_8x4x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X8, vpx_highbd_sad4x8_bits12, vpx_highbd_sad_skip_4x8_bits12, + vpx_highbd_sad4x8_avg_bits12, vpx_highbd_12_variance4x8, + vpx_highbd_12_sub_pixel_variance4x8, + vpx_highbd_12_sub_pixel_avg_variance4x8, + vpx_highbd_sad4x8x4d_bits12, vpx_highbd_sad_skip_4x8x4d_bits12) + + HIGHBD_BFP( + BLOCK_4X4, vpx_highbd_sad4x4_bits12, vpx_highbd_sad_skip_4x4_bits12, + vpx_highbd_sad4x4_avg_bits12, vpx_highbd_12_variance4x4, + vpx_highbd_12_sub_pixel_variance4x4, + vpx_highbd_12_sub_pixel_avg_variance4x4, + vpx_highbd_sad4x4x4d_bits12, vpx_highbd_sad_skip_4x4x4d_bits12) break; } } @@ -1926,48 +2003,48 @@ static void realloc_segmentation_maps(VP9_COMP *cpi) { // Create the encoder segmentation map and set all entries to 0 vpx_free(cpi->segmentation_map); - CHECK_MEM_ERROR(cm, cpi->segmentation_map, + CHECK_MEM_ERROR(&cm->error, cpi->segmentation_map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // Create a map used for cyclic background refresh. if (cpi->cyclic_refresh) vp9_cyclic_refresh_free(cpi->cyclic_refresh); - CHECK_MEM_ERROR(cm, cpi->cyclic_refresh, + CHECK_MEM_ERROR(&cm->error, cpi->cyclic_refresh, vp9_cyclic_refresh_alloc(cm->mi_rows, cm->mi_cols)); // Create a map used to mark inactive areas. 
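The CHECK_MEM_ERROR churn in this and the surrounding hunks comes from the macro now taking a struct vpx_internal_error_info * (hence &cm->error) instead of a VP9_COMMON *, so code that carries only an error context can use it too. Its behavior is roughly the sketch below; this is a simplified model, not the library's exact definition.

// Simplified model of the updated macro: assign the allocation, and on
// failure report through the codec's internal error state (which jumps
// back to the API boundary) rather than returning an error code.
#define CHECK_MEM_ERROR(error, lval, expr)                    \
  do {                                                        \
    (lval) = (expr);                                          \
    if (!(lval))                                              \
      vpx_internal_error(error, VPX_CODEC_MEM_ERROR,          \
                         "Failed to allocate " #lval);        \
  } while (0)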
vpx_free(cpi->active_map.map); - CHECK_MEM_ERROR(cm, cpi->active_map.map, + CHECK_MEM_ERROR(&cm->error, cpi->active_map.map, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); // And a place holder structure is the coding context // for use if we want to save and restore it vpx_free(cpi->coding_context.last_frame_seg_map_copy); - CHECK_MEM_ERROR(cm, cpi->coding_context.last_frame_seg_map_copy, + CHECK_MEM_ERROR(&cm->error, cpi->coding_context.last_frame_seg_map_copy, vpx_calloc(cm->mi_rows * cm->mi_cols, 1)); } static void alloc_copy_partition_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; if (cpi->prev_partition == NULL) { - CHECK_MEM_ERROR(cm, cpi->prev_partition, + CHECK_MEM_ERROR(&cm->error, cpi->prev_partition, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(*cpi->prev_partition))); } if (cpi->prev_segment_id == NULL) { CHECK_MEM_ERROR( - cm, cpi->prev_segment_id, + &cm->error, cpi->prev_segment_id, (int8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->prev_segment_id))); } if (cpi->prev_variance_low == NULL) { - CHECK_MEM_ERROR(cm, cpi->prev_variance_low, + CHECK_MEM_ERROR(&cm->error, cpi->prev_variance_low, (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1) * 25, sizeof(*cpi->prev_variance_low))); } if (cpi->copied_frame_cnt == NULL) { CHECK_MEM_ERROR( - cm, cpi->copied_frame_cnt, + &cm->error, cpi->copied_frame_cnt, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->copied_frame_cnt))); } @@ -2085,13 +2162,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { if (last_w != cpi->oxcf.width || last_h != cpi->oxcf.height) { vpx_free(cpi->consec_zero_mv); CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, + &cm->error, cpi->consec_zero_mv, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); vpx_free(cpi->skin_map); CHECK_MEM_ERROR( - cm, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); free_copy_partition_data(cpi); alloc_copy_partition_data(cpi); @@ -2132,18 +2209,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { vp9_set_row_mt(cpi); } -#ifndef M_LOG2_E -#define M_LOG2_E 0.693147180559945309417 -#endif -#define log2f(x) (log(x) / (float)M_LOG2_E) - /*********************************************************************** * Read before modifying 'cal_nmvjointsadcost' or 'cal_nmvsadcosts' * *********************************************************************** * The following 2 functions ('cal_nmvjointsadcost' and * * 'cal_nmvsadcosts') are used to calculate cost lookup tables * * used by 'vp9_diamond_search_sad'. 
The C implementation of the * - * function is generic, but the AVX intrinsics optimised version * + * function is generic, but the NEON intrinsics optimised version * * relies on the following properties of the computed tables: * * For cal_nmvjointsadcost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * @@ -2152,7 +2224,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * - * If these do not hold, then the AVX optimised version of the * + * If these do not hold, then the NEON optimised version of the * * 'vp9_diamond_search_sad' function cannot be used as it is, in which * * case you can revert to using the C function instead. * ***********************************************************************/ @@ -2310,7 +2382,7 @@ void vp9_update_compressor_with_img_fmt(VP9_COMP *cpi, vpx_img_fmt_t img_fmt) { VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, BufferPool *const pool) { unsigned int i; - VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(VP9_COMP)); + VP9_COMP *volatile const cpi = vpx_memalign(32, sizeof(*cpi)); VP9_COMMON *volatile const cm = cpi != NULL ? &cpi->common : NULL; if (!cm) return NULL; @@ -2328,9 +2400,10 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cm->free_mi = vp9_enc_free_mi; cm->setup_mi = vp9_enc_setup_mi; - CHECK_MEM_ERROR(cm, cm->fc, (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); + CHECK_MEM_ERROR(&cm->error, cm->fc, + (FRAME_CONTEXT *)vpx_calloc(1, sizeof(*cm->fc))); CHECK_MEM_ERROR( - cm, cm->frame_contexts, + &cm->error, cm->frame_contexts, (FRAME_CONTEXT *)vpx_calloc(FRAME_CONTEXTS, sizeof(*cm->frame_contexts))); cpi->compute_frame_low_motion_onepass = 1; @@ -2357,38 +2430,38 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, realloc_segmentation_maps(cpi); CHECK_MEM_ERROR( - cm, cpi->skin_map, - vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0]))); + &cm->error, cpi->skin_map, + vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->skin_map))); #if !CONFIG_REALTIME_ONLY - CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); + CHECK_MEM_ERROR(&cm->error, cpi->alt_ref_aq, vp9_alt_ref_aq_create()); #endif CHECK_MEM_ERROR( - cm, cpi->consec_zero_mv, + &cm->error, cpi->consec_zero_mv, vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(*cpi->consec_zero_mv))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts_hp[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts[1], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts[1]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[0], + CHECK_MEM_ERROR(&cm->error, cpi->nmvsadcosts_hp[0], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[0]))); - CHECK_MEM_ERROR(cm, cpi->nmvsadcosts_hp[1], + CHECK_MEM_ERROR(&cm->error, 
cpi->nmvsadcosts_hp[1], vpx_calloc(MV_VALS, sizeof(*cpi->nmvsadcosts_hp[1]))); for (i = 0; i < (sizeof(cpi->mbgraph_stats) / sizeof(cpi->mbgraph_stats[0])); i++) { CHECK_MEM_ERROR( - cm, cpi->mbgraph_stats[i].mb_stats, + &cm->error, cpi->mbgraph_stats[i].mb_stats, vpx_calloc(cm->MBs * sizeof(*cpi->mbgraph_stats[i].mb_stats), 1)); } @@ -2432,7 +2505,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } if (cpi->b_calculate_consistency) { - CHECK_MEM_ERROR(cm, cpi->ssim_vars, + CHECK_MEM_ERROR(&cm->error, cpi->ssim_vars, vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, sizeof(*cpi->ssim_vars) * 4)); cpi->worst_consistency = 100.0; @@ -2503,11 +2576,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, cpi->svc.number_temporal_layers > 1) { FIRSTPASS_STATS *const stats = oxcf->two_pass_stats_in.buf; FIRSTPASS_STATS *stats_copy[VPX_SS_MAX_LAYERS] = { 0 }; - int i; + int n; - for (i = 0; i < oxcf->ss_number_layers; ++i) { + for (n = 0; n < oxcf->ss_number_layers; ++n) { FIRSTPASS_STATS *const last_packet_for_layer = - &stats[packets - oxcf->ss_number_layers + i]; + &stats[packets - oxcf->ss_number_layers + n]; const int layer_id = (int)last_packet_for_layer->spatial_layer_id; const int packets_in_layer = (int)last_packet_for_layer->count + 1; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers) { @@ -2517,7 +2590,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, vpx_free(lc->rc_twopass_stats_in.buf); lc->rc_twopass_stats_in.sz = packets_in_layer * packet_sz; - CHECK_MEM_ERROR(cm, lc->rc_twopass_stats_in.buf, + CHECK_MEM_ERROR(&cm->error, lc->rc_twopass_stats_in.buf, vpx_malloc(lc->rc_twopass_stats_in.sz)); lc->twopass.stats_in_start = lc->rc_twopass_stats_in.buf; lc->twopass.stats_in = lc->twopass.stats_in_start; @@ -2532,11 +2605,11 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, } } - for (i = 0; i < packets; ++i) { - const int layer_id = (int)stats[i].spatial_layer_id; + for (n = 0; n < packets; ++n) { + const int layer_id = (int)stats[n].spatial_layer_id; if (layer_id >= 0 && layer_id < oxcf->ss_number_layers && stats_copy[layer_id] != NULL) { - *stats_copy[layer_id] = stats[i]; + *stats_copy[layer_id] = stats[n]; ++stats_copy[layer_id]; } } @@ -2572,7 +2645,7 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, const int h = num_8x8_blocks_high_lookup[bsize]; const int num_cols = (cm->mi_cols + w - 1) / w; const int num_rows = (cm->mi_rows + h - 1) / h; - CHECK_MEM_ERROR(cm, cpi->mi_ssim_rdmult_scaling_factors, + CHECK_MEM_ERROR(&cm->error, cpi->mi_ssim_rdmult_scaling_factors, vpx_calloc(num_rows * num_cols, sizeof(*cpi->mi_ssim_rdmult_scaling_factors))); } @@ -2581,67 +2654,76 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, #if CONFIG_NON_GREEDY_MV cpi->tpl_ready = 0; #endif // CONFIG_NON_GREEDY_MV - for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) cpi->tpl_stats[i].tpl_stats_ptr = NULL; + for (i = 0; i < MAX_ARF_GOP_SIZE; ++i) { + cpi->tpl_stats[i].tpl_stats_ptr = NULL; + } // Allocate memory to store variances for a frame. 
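Several allocations above also move from sizeof(type) or sizeof(ptr[0]) to the sizeof(*ptr) idiom, which stays correct even if the pointee type is later changed. The pitfall the idiom guards against is writing sizeof(ptr), which silently sizes elements as pointers; a toy example (this local diff struct is a stand-in for the encoder's real type):

#include <stdlib.h>

typedef struct { unsigned int sse, sum, var; } diff;  // 12-byte element

int demo(size_t n) {
  diff *v = calloc(n, sizeof(*v));  // element size: n * 12 bytes
  /* The buggy form, calloc(n, sizeof(v)), would request n * 8 bytes on a
     64-bit target and overflow once writes pass two thirds of the array. */
  if (v == NULL) return -1;
  free(v);
  return 0;
}

The source_diff_var allocation that follows is sized with the same idiom.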
- CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff))); + CHECK_MEM_ERROR(&cm->error, cpi->source_diff_var, + vpx_calloc(cm->MBs, sizeof(*cpi->source_diff_var))); cpi->source_var_thresh = 0; cpi->frames_till_next_var_check = 0; -#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \ - cpi->fn_ptr[BT].sdf = SDF; \ - cpi->fn_ptr[BT].sdaf = SDAF; \ - cpi->fn_ptr[BT].vf = VF; \ - cpi->fn_ptr[BT].svf = SVF; \ - cpi->fn_ptr[BT].svaf = SVAF; \ - cpi->fn_ptr[BT].sdx4df = SDX4DF; - - BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16, - vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16, - vpx_sad32x16x4d) - - BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32, - vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32, - vpx_sad16x32x4d) - - BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32, - vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32, - vpx_sad64x32x4d) - - BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64, - vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64, - vpx_sad32x64x4d) - - BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32, - vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32, - vpx_sad32x32x4d) - - BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64, - vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64, - vpx_sad64x64x4d) - - BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16, - vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16, - vpx_sad16x16x4d) - - BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8, - vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8, - vpx_sad16x8x4d) - - BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16, - vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16, - vpx_sad8x16x4d) - - BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8, - vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d) - - BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4, - vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d) - - BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8, - vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d) - - BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4, - vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d) +#define BFP(BT, SDF, SDSF, SDAF, VF, SVF, SVAF, SDX4DF, SDSX4DF) \ + cpi->fn_ptr[BT].sdf = SDF; \ + cpi->fn_ptr[BT].sdsf = SDSF; \ + cpi->fn_ptr[BT].sdaf = SDAF; \ + cpi->fn_ptr[BT].vf = VF; \ + cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ + cpi->fn_ptr[BT].sdx4df = SDX4DF; \ + cpi->fn_ptr[BT].sdsx4df = SDSX4DF; + + BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad_skip_32x16, vpx_sad32x16_avg, + vpx_variance32x16, vpx_sub_pixel_variance32x16, + vpx_sub_pixel_avg_variance32x16, vpx_sad32x16x4d, vpx_sad_skip_32x16x4d) + + BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad_skip_16x32, vpx_sad16x32_avg, + vpx_variance16x32, vpx_sub_pixel_variance16x32, + vpx_sub_pixel_avg_variance16x32, vpx_sad16x32x4d, vpx_sad_skip_16x32x4d) + + BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad_skip_64x32, vpx_sad64x32_avg, + vpx_variance64x32, vpx_sub_pixel_variance64x32, + vpx_sub_pixel_avg_variance64x32, vpx_sad64x32x4d, vpx_sad_skip_64x32x4d) + + BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad_skip_32x64, vpx_sad32x64_avg, + vpx_variance32x64, vpx_sub_pixel_variance32x64, + vpx_sub_pixel_avg_variance32x64, vpx_sad32x64x4d,
vpx_sad_skip_32x64x4d) + + BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad_skip_32x32, vpx_sad32x32_avg, + vpx_variance32x32, vpx_sub_pixel_variance32x32, + vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x4d, vpx_sad_skip_32x32x4d) + + BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad_skip_64x64, vpx_sad64x64_avg, + vpx_variance64x64, vpx_sub_pixel_variance64x64, + vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x4d, vpx_sad_skip_64x64x4d) + + BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad_skip_16x16, vpx_sad16x16_avg, + vpx_variance16x16, vpx_sub_pixel_variance16x16, + vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x4d, vpx_sad_skip_16x16x4d) + + BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad_skip_16x8, vpx_sad16x8_avg, + vpx_variance16x8, vpx_sub_pixel_variance16x8, + vpx_sub_pixel_avg_variance16x8, vpx_sad16x8x4d, vpx_sad_skip_16x8x4d) + + BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad_skip_8x16, vpx_sad8x16_avg, + vpx_variance8x16, vpx_sub_pixel_variance8x16, + vpx_sub_pixel_avg_variance8x16, vpx_sad8x16x4d, vpx_sad_skip_8x16x4d) + + BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad_skip_8x8, vpx_sad8x8_avg, vpx_variance8x8, + vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d, + vpx_sad_skip_8x8x4d) + + BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad_skip_8x4, vpx_sad8x4_avg, vpx_variance8x4, + vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d, + vpx_sad_skip_8x4x4d) + + BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad_skip_4x8, vpx_sad4x8_avg, vpx_variance4x8, + vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d, + vpx_sad_skip_4x8x4d) + + BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad_skip_4x4, vpx_sad4x4_avg, vpx_variance4x4, + vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d, + vpx_sad_skip_4x4x4d) #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); @@ -2689,8 +2771,6 @@ VP9_COMP *vp9_create_compressor(const VP9EncoderConfig *oxcf, snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T), (V)) #endif // CONFIG_INTERNAL_STATS -static void free_tpl_buffer(VP9_COMP *cpi); - void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *cm; unsigned int i; @@ -2784,7 +2864,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { #if 0 { printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000); - printf("\n_frames recive_data encod_mb_row compress_frame Total\n"); + printf("\n_frames receive_data encod_mb_row compress_frame Total\n"); printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_sb_row / 1000, cpi->time_compress_data / 1000, @@ -2804,7 +2884,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->kmeans_data_arr); } - free_tpl_buffer(cpi); + vp9_free_tpl_buffer(cpi); vp9_loop_filter_dealloc(&cpi->lf_row_sync); vp9_bitstream_encode_tiles_buffer_dealloc(cpi); @@ -2824,6 +2904,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_extrc_delete(&cpi->ext_ratectrl); + // Help detect use after free of the error detail string. 
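The teardown addition just below is a small hardening trick: cm->error.detail is overwritten with a recognizable 'AAAA...' pattern (kept NUL-terminated) right before the common data is destroyed, so any caller that holds on to the detail string past vp9_remove_compressor() sees obvious garbage instead of a plausible message. The poisoning idea generalizes; a hedged helper sketch, not part of the diff:

#include <string.h>

// Poison a soon-to-be-invalid string buffer so stale readers fail
// loudly; keep a terminating NUL in case the string is still printed.
static void poison_cstr(char *buf, size_t size) {
  if (size == 0) return;
  memset(buf, 'A', size - 1);
  buf[size - 1] = '\0';
}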
+ memset(cm->error.detail, 'A', sizeof(cm->error.detail) - 1); + cm->error.detail[sizeof(cm->error.detail) - 1] = '\0'; + vp9_remove_common(cm); vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC @@ -2893,7 +2977,7 @@ void vp9_update_reference(VP9_COMP *cpi, int ref_frame_flags) { static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag) { - MV_REFERENCE_FRAME ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame = NO_REF_FRAME; if (ref_frame_flag == VP9_LAST_FLAG) ref_frame = LAST_FRAME; else if (ref_frame_flag == VP9_GOLD_FLAG) @@ -2901,7 +2985,8 @@ static YV12_BUFFER_CONFIG *get_vp9_ref_frame_buffer( else if (ref_frame_flag == VP9_ALT_FLAG) ref_frame = ALTREF_FRAME; - return ref_frame == NONE ? NULL : get_ref_frame_buffer(cpi, ref_frame); + return ref_frame == NO_REF_FRAME ? NULL + : get_ref_frame_buffer(cpi, ref_frame); } int vp9_copy_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, @@ -2994,12 +3079,11 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { #endif #if CONFIG_VP9_HIGHBITDEPTH -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst, - int bd) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd) { #else -static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *dst) { +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { #endif // CONFIG_VP9_HIGHBITDEPTH // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t int i; @@ -3044,6 +3128,23 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, const int src_h = src->y_crop_height; const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. + // For bd = 8, vpx_scaled_2d() requires both x_step_q4 and y_step_q4 are less + // than or equal to 64. For bd >= 10, vpx_highbd_convolve8() requires both + // x_step_q4 and y_step_q4 are less than or equal to 32. If this condition + // isn't met, it needs to call vp9_scale_and_extend_frame_nonnormative() that + // supports arbitrary scaling. 
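The guard being introduced here turns the scaler limits into arithmetic: step_q4 is a 4.4 fixed-point ratio in which 16 means 1:1, so the 8-bit limit of 64 corresponds to a 4x downscale and the high-bit-depth limit of 32 to 2x. For example, scaling a 1920-wide source to a 320-wide target gives x_step_q4 = 16 * 1920 / 320 = 96 > 64, which forces the nonnormative fallback even at 8-bit. The predicate, lifted into a self-contained helper for clarity (same logic and names as the code that follows; the helper itself is illustrative):

// Nonzero when the normative fixed-point scalers cannot represent the
// requested ratio and the arbitrary-ratio fallback must be used.
static int needs_nonnormative_scaler(int src_w, int src_h, int dst_w,
                                     int dst_h, int bd) {
  const int x_step_q4 = 16 * src_w / dst_w;  // 4.4 fixed point; 16 == 1:1
  const int y_step_q4 = 16 * src_h / dst_h;
  return (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) ||
         (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32));
}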
+ const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + const int is_arbitrary_scaling = + (bd == 8 && (x_step_q4 > 64 || y_step_q4 > 64)) || + (bd >= 10 && (x_step_q4 > 32 || y_step_q4 > 32)); + if (is_arbitrary_scaling) { + vp9_scale_and_extend_frame_nonnormative(src, dst, bd); + return; + } + const uint8_t *const srcs[3] = { src->y_buffer, src->u_buffer, src->v_buffer }; const int src_strides[3] = { src->y_stride, src->uv_stride, src->uv_stride }; @@ -3352,19 +3453,6 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { vpx_extend_frame_inner_borders(cm->frame_to_show); } -static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { - RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; - if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || - new_fb_ptr->mi_cols < cm->mi_cols) { - vpx_free(new_fb_ptr->mvs); - CHECK_MEM_ERROR(cm, new_fb_ptr->mvs, - (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, - sizeof(*new_fb_ptr->mvs))); - new_fb_ptr->mi_rows = cm->mi_rows; - new_fb_ptr->mi_cols = cm->mi_cols; - } -} - void vp9_scale_references(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; MV_REFERENCE_FRAME ref_frame; @@ -3711,7 +3799,7 @@ static void set_size_dependent_vars(VP9_COMP *cpi, int *q, int *bottom_index, case 6: l = 150; break; } if (!cpi->common.postproc_state.limits) { - CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits, + CHECK_MEM_ERROR(&cm->error, cpi->common.postproc_state.limits, vpx_calloc(cpi->un_scaled_source->y_width, sizeof(*cpi->common.postproc_state.limits))); } @@ -3800,6 +3888,7 @@ static void set_frame_size(VP9_COMP *cpi) { alloc_util_frame_buffers(cpi); init_motion_estimation(cpi); + int has_valid_ref_frame = 0; for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1]; const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame); @@ -3818,22 +3907,25 @@ static void set_frame_size(VP9_COMP *cpi) { buf->y_crop_height, cm->width, cm->height); #endif // CONFIG_VP9_HIGHBITDEPTH + has_valid_ref_frame |= vp9_is_valid_scale(&ref_buf->sf); if (vp9_is_scaled(&ref_buf->sf)) vpx_extend_frame_borders(buf); } else { ref_buf->buf = NULL; } } + if (!frame_is_intra_only(cm) && !has_valid_ref_frame) { + vpx_internal_error( + &cm->error, VPX_CODEC_CORRUPT_FRAME, + "Can't find at least one reference frame with valid size"); + } set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME); } -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static void save_encode_params(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; - const int tile_rows = 1 << cm->log2_tile_rows; - int tile_col, tile_row; + int tile_idx; int i, j; + TileDataEnc *tile_data; RD_OPT *rd_opt = &cpi->rd; for (i = 0; i < MAX_REF_FRAMES; i++) { for (j = 0; j < REFERENCE_MODES; j++) @@ -3844,21 +3936,12 @@ static void save_encode_params(VP9_COMP *cpi) { rd_opt->filter_threshes_prev[i][j] = rd_opt->filter_threshes[i][j]; } - if (cpi->tile_data != NULL) { - for (tile_row = 0; tile_row < tile_rows; ++tile_row) - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - TileDataEnc *tile_data = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - for (i = 0; i < BLOCK_SIZES; ++i) { - for (j = 0; j < MAX_MODES; ++j) { - tile_data->thresh_freq_fact_prev[i][j] = - tile_data->thresh_freq_fact[i][j]; - } - } - } + for (tile_idx = 0; tile_idx < cpi->allocated_tiles; tile_idx++) { + assert(cpi->tile_data); + tile_data = 
&cpi->tile_data[tile_idx]; + vp9_copy(tile_data->thresh_freq_fact_prev, tile_data->thresh_freq_fact); } } -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL static INLINE void set_raw_source_frame(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE @@ -4005,6 +4088,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, cpi->rc.hybrid_intra_scene_change = 0; cpi->rc.re_encode_maxq_scene_change = 0; if (cm->show_frame && cpi->oxcf.mode == REALTIME && + !cpi->disable_scene_detection_rtc_ratectrl && (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.content == VP9E_CONTENT_SCREEN || (cpi->oxcf.speed >= 5 && cpi->oxcf.speed < 8))) @@ -4067,7 +4151,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size, svc->spatial_layer_id == svc->number_spatial_layers - 2) { if (svc->prev_partition_svc == NULL) { CHECK_MEM_ERROR( - cm, svc->prev_partition_svc, + &cm->error, svc->prev_partition_svc, (BLOCK_SIZE *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(*svc->prev_partition_svc))); } @@ -4419,10 +4503,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth; #if CONFIG_RATE_CTRL - const FRAME_UPDATE_TYPE update_type = - cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; - const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); - RATE_QSTEP_MODEL *rq_model = &cpi->rq_model[frame_type]; + RATE_QSTEP_MODEL *rq_model; + { + const FRAME_UPDATE_TYPE update_type = + cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index]; + const ENCODE_FRAME_TYPE frame_type = get_encode_frame_type(update_type); + rq_model = &cpi->rq_model[frame_type]; + } init_rq_history(rq_history); #endif // CONFIG_RATE_CTRL @@ -4438,6 +4525,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest (cpi->twopass.gf_group.index == 1) : 0; +#if CONFIG_COLLECT_COMPONENT_TIMING + printf("\n Encoding a frame: \n"); +#endif do { vpx_clear_system_state(); @@ -4525,7 +4615,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest } #endif // CONFIG_RATE_CTRL if (cpi->ext_ratectrl.ready && !ext_rc_recode && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0 && + cpi->ext_ratectrl.funcs.get_encodeframe_decision != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; vpx_rc_encodeframe_decision_t encode_frame_decision; @@ -4825,6 +4916,9 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) if (loop) restore_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + if (loop) printf("\n Recoding:"); +#endif } while (loop); rc->max_frame_bandwidth = orig_rc_max_frame_bandwidth; @@ -4922,13 +5016,14 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required( scale_and_extend_frame(unscaled, scaled, (int)cm->bit_depth, filter_type, phase_scaler); else - scale_and_extend_frame_nonnormative(unscaled, scaled, (int)cm->bit_depth); + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled, + (int)cm->bit_depth); #else if (use_normative_scaler && unscaled->y_width <= (scaled->y_width << 1) && unscaled->y_height <= (scaled->y_height << 1)) vp9_scale_and_extend_frame(unscaled, scaled, filter_type, phase_scaler); else - scale_and_extend_frame_nonnormative(unscaled, scaled); + vp9_scale_and_extend_frame_nonnormative(unscaled, scaled); #endif // CONFIG_VP9_HIGHBITDEPTH return scaled; } else { @@ -4980,8 +5075,8 
@@ static int setup_interp_filter_search_mask(VP9_COMP *cpi) { #ifdef ENABLE_KF_DENOISE // Baseline kernel weights for denoise -static uint8_t dn_kernal_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -static uint8_t dn_kernal_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, +static uint8_t dn_kernel_3[9] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; +static uint8_t dn_kernel_5[25] = { 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 4, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1 }; static INLINE void add_denoise_point(int centre_val, int data_val, int thresh, @@ -4998,17 +5093,17 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)*src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; @@ -5016,19 +5111,19 @@ static void spatial_denoise_point(uint8_t *src_ptr, const int stride, // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? dn_kernel_3 : dn_kernel_5; // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -5043,17 +5138,17 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, int sum_weight = 0; int sum_val = 0; int thresh = strength; - int kernal_size = 5; + int kernel_size = 5; int half_k_size = 2; int i, j; int max_diff = 0; uint16_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; // Find the maximum deviation from the source point in the locale. tmp_ptr = src_ptr - (stride * (half_k_size + 1)) - (half_k_size + 1); - for (i = 0; i < kernal_size + 2; ++i) { - for (j = 0; j < kernal_size + 2; ++j) { + for (i = 0; i < kernel_size + 2; ++i) { + for (j = 0; j < kernel_size + 2; ++j) { max_diff = VPXMAX(max_diff, abs((int)src_ptr - (int)tmp_ptr[j])); } tmp_ptr += stride; @@ -5061,19 +5156,19 @@ static void highbd_spatial_denoise_point(uint16_t *src_ptr, const int stride, // Select the kernel size. if (max_diff > (strength + (strength >> 1))) { - kernal_size = 3; + kernel_size = 3; half_k_size = 1; thresh = thresh >> 1; } - kernal_ptr = (kernal_size == 3) ? dn_kernal_3 : dn_kernal_5; + kernel_ptr = (kernel_size == 3) ? 
dn_kernel_3 : dn_kernel_5; // Apply the kernel tmp_ptr = src_ptr - (stride * half_k_size) - half_k_size; - for (i = 0; i < kernal_size; ++i) { - for (j = 0; j < kernal_size; ++j) { - add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernal_ptr, + for (i = 0; i < kernel_size; ++i) { + for (j = 0; j < kernel_size; ++j) { + add_denoise_point((int)*src_ptr, (int)tmp_ptr[j], thresh, *kernel_ptr, &sum_val, &sum_weight); - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -5260,7 +5355,7 @@ static void init_mb_wiener_var_buffer(VP9_COMP *cpi) { cpi->mb_wiener_variance = NULL; CHECK_MEM_ERROR( - cm, cpi->mb_wiener_variance, + &cm->error, cpi->mb_wiener_variance, vpx_calloc(cm->mb_rows * cm->mb_cols, sizeof(*cpi->mb_wiener_variance))); cpi->mb_wiener_var_rows = cm->mb_rows; cpi->mb_wiener_var_cols = cm->mb_cols; @@ -5319,16 +5414,16 @@ static void set_mb_wiener_variance(VP9_COMP *cpi) { vpx_highbd_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size, xd->bd); - highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_highbd_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } else { vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); } #else vpx_subtract_block(block_size, block_size, src_diff, block_size, mb_buffer, buf_stride, zero_pred, block_size); - wht_fwd_txfm(src_diff, block_size, coeff, tx_size); + vp9_wht_fwd_txfm(src_diff, block_size, coeff, tx_size); #endif // CONFIG_VP9_HIGHBITDEPTH coeff[0] = 0; @@ -5447,26 +5542,7 @@ static void encode_frame_to_data_rate( struct segmentation *const seg = &cm->seg; TX_SIZE t; - // SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. - // No need to set svc.skip_enhancement_layer if whole superframe will be - // dropped. - if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && - cpi->oxcf.target_bandwidth == 0 && - !(cpi->svc.framedrop_mode != LAYER_DROP && - (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || - cpi->svc - .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - - 1]) && - cpi->svc.drop_spatial_layer[0])) { - cpi->svc.skip_enhancement_layer = 1; - vp9_rc_postencode_update_drop_frame(cpi); - cpi->ext_refresh_frame_flags_pending = 0; - cpi->last_frame_dropped = 1; - cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; - vp9_inc_frame_in_layer(cpi); - return; - } + if (vp9_svc_check_skip_enhancement_layer(cpi)) return; set_ext_overrides(cpi); vpx_clear_system_state(); @@ -5484,6 +5560,11 @@ static void encode_frame_to_data_rate( set_ref_sign_bias(cpi); } + // On the very first frame set the deadline_mode_previous_frame to + // the current mode. + if (cpi->common.current_video_frame == 0) + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; + // Set default state for segment based loop filter update flags. 
cm->lf.mode_ref_delta_update = 0; @@ -5531,16 +5612,11 @@ static void encode_frame_to_data_rate( memset(cpi->mode_chosen_counts, 0, MAX_MODES * sizeof(*cpi->mode_chosen_counts)); #endif -#if CONFIG_CONSISTENT_RECODE // Backup to ensure consistency between recodes save_encode_params(cpi); -#elif CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - save_encode_params(cpi); - } -#endif if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 && + cpi->ext_ratectrl.funcs.get_frame_rdmult != NULL) { vpx_codec_err_t codec_status; const GF_GROUP *gf_group = &cpi->twopass.gf_group; FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index]; @@ -5572,8 +5648,14 @@ static void encode_frame_to_data_rate( #if !CONFIG_REALTIME_ONLY #if CONFIG_RATE_CTRL encode_with_recode_loop(cpi, size, dest, &encode_frame_result->rq_history); -#else // CONFIG_RATE_CTRL +#else // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, encode_with_recode_loop_time); +#endif encode_with_recode_loop(cpi, size, dest); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, encode_with_recode_loop_time); +#endif #endif // CONFIG_RATE_CTRL #endif // !CONFIG_REALTIME_ONLY } @@ -5632,15 +5714,28 @@ static void encode_frame_to_data_rate( cm->frame_to_show->render_width = cm->render_width; cm->frame_to_show->render_height = cm->render_height; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, loopfilter_frame_time); +#endif // Pick the loop filter level for the frame. loopfilter_frame(cpi, cm); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, loopfilter_frame_time); +#endif if (cpi->rc.use_post_encode_drop) save_coding_context(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_pack_bitstream_time); +#endif // build the bitstream vp9_pack_bitstream(cpi, dest, size); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_pack_bitstream_time); +#endif - if (cpi->ext_ratectrl.ready) { + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.update_encodeframe_result != NULL) { const RefCntBuffer *coded_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx); vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result( @@ -6228,1391 +6323,6 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { } } -typedef struct GF_PICTURE { - YV12_BUFFER_CONFIG *frame; - int ref_frame[3]; - FRAME_UPDATE_TYPE update_type; -} GF_PICTURE; - -static void init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, - const GF_GROUP *gf_group, int *tpl_group_frames) { - VP9_COMMON *cm = &cpi->common; - int frame_idx = 0; - int i; - int gld_index = -1; - int alt_index = -1; - int lst_index = -1; - int arf_index_stack[MAX_ARF_LAYERS]; - int arf_stack_size = 0; - int extend_frame_count = 0; - int pframe_qindex = cpi->tpl_stats[2].base_qindex; - int frame_gop_offset = 0; - - RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; - int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; - - memset(recon_frame_index, -1, sizeof(recon_frame_index)); - stack_init(arf_index_stack, MAX_ARF_LAYERS); - - // TODO(jingning): To be used later for gf frame type parsing. 
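A pattern running through the encode_frame_to_data_rate() changes above: the external rate-control hooks (get_encodeframe_decision earlier, get_frame_rdmult and update_encodeframe_result here) are all optional, and each call site now tests the function pointer for NULL in addition to the rc_type capability bits, so an integration may populate only the hooks it implements. The shape of the pattern, shown with toy types rather than the actual vpx_rc_funcs_t:

#include <stddef.h>

// Toy version of an optional-hook table: unset members stay NULL and
// callers fall back to the built-in behavior.
typedef struct {
  int (*get_frame_rdmult)(void *model, int *rdmult);  // optional hook
  void *model;
} toy_rc_funcs;

static int frame_rdmult(const toy_rc_funcs *rc, int default_rdmult) {
  int rdmult = default_rdmult;
  if (rc->get_frame_rdmult != NULL) rc->get_frame_rdmult(rc->model, &rdmult);
  return rdmult;
}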
- (void)gf_group; - - for (i = 0; i < FRAME_BUFFERS; ++i) { - if (frame_bufs[i].ref_count == 0) { - alloc_frame_mvs(cm, i); - if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, - cm->subsampling_x, cm->subsampling_y, -#if CONFIG_VP9_HIGHBITDEPTH - cm->use_highbitdepth, -#endif - VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, - NULL, NULL, NULL)) - vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, - "Failed to allocate frame buffer"); - - recon_frame_index[frame_idx] = i; - ++frame_idx; - - if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; - } - } - - for (i = 0; i < REFS_PER_FRAME + 1; ++i) { - assert(recon_frame_index[i] >= 0); - cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; - } - - *tpl_group_frames = 0; - - // Initialize Golden reference frame. - gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); - for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; - gf_picture[0].update_type = gf_group->update_type[0]; - gld_index = 0; - ++*tpl_group_frames; - - // Initialize base layer ARF frame - gf_picture[1].frame = cpi->Source; - gf_picture[1].ref_frame[0] = gld_index; - gf_picture[1].ref_frame[1] = lst_index; - gf_picture[1].ref_frame[2] = alt_index; - gf_picture[1].update_type = gf_group->update_type[1]; - alt_index = 1; - ++*tpl_group_frames; - - // Initialize P frames - for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { - struct lookahead_entry *buf; - frame_gop_offset = gf_group->frame_gop_index[frame_idx]; - buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); - - if (buf == NULL) break; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; - - switch (gf_group->update_type[frame_idx]) { - case ARF_UPDATE: - stack_push(arf_index_stack, alt_index, arf_stack_size); - ++arf_stack_size; - alt_index = frame_idx; - break; - case LF_UPDATE: lst_index = frame_idx; break; - case OVERLAY_UPDATE: - gld_index = frame_idx; - alt_index = stack_pop(arf_index_stack, arf_stack_size); - --arf_stack_size; - break; - case USE_BUF_FRAME: - lst_index = alt_index; - alt_index = stack_pop(arf_index_stack, arf_stack_size); - --arf_stack_size; - break; - default: break; - } - - ++*tpl_group_frames; - - // The length of group of pictures is baseline_gf_interval, plus the - // beginning golden frame from last GOP, plus the last overlay frame in - // the same GOP. - if (frame_idx == gf_group->gf_group_size) break; - } - - alt_index = -1; - ++frame_idx; - ++frame_gop_offset; - - // Extend two frames outside the current gf group. 
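// The deleted code above tracks nested ARF references with a small LIFO of
// frame indices (stack_init/stack_push/stack_pop). A self-contained sketch of
// such helpers, assuming the top of the stack lives at index 0 and -1 marks
// an empty slot (the real static definitions may differ in detail):
static INLINE void stack_init(int *stack, int length) {
  int idx;
  for (idx = 0; idx < length; ++idx) stack[idx] = -1;
}
static INLINE void stack_push(int *stack, int new_item, int size) {
  int idx;
  for (idx = size; idx > 0; --idx) stack[idx] = stack[idx - 1];  // shift up
  stack[0] = new_item;  // new top of stack
}
static INLINE int stack_pop(int *stack, int size) {
  int idx;
  const int item = stack[0];
  for (idx = 0; idx < size; ++idx) stack[idx] = stack[idx + 1];  // shift down
  return item;
}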
- for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { - struct lookahead_entry *buf = - vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); - - if (buf == NULL) break; - - cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; - - gf_picture[frame_idx].frame = &buf->img; - gf_picture[frame_idx].ref_frame[0] = gld_index; - gf_picture[frame_idx].ref_frame[1] = lst_index; - gf_picture[frame_idx].ref_frame[2] = alt_index; - gf_picture[frame_idx].update_type = LF_UPDATE; - lst_index = frame_idx; - ++*tpl_group_frames; - ++extend_frame_count; - ++frame_gop_offset; - } -} - -static void init_tpl_stats(VP9_COMP *cpi) { - int frame_idx; - for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - memset(tpl_frame->tpl_stats_ptr, 0, - tpl_frame->height * tpl_frame->width * - sizeof(*tpl_frame->tpl_stats_ptr)); - tpl_frame->is_valid = 0; - } -} - -#if CONFIG_NON_GREEDY_MV -static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, - MotionField *motion_field, - int frame_idx, uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, int stride, - BLOCK_SIZE bsize, int mi_row, - int mi_col, MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - int step_param; - uint32_t bestsme = UINT_MAX; - const MvLimits tmp_mv_limits = x->mv_limits; - // lambda is used to adjust the importance of motion vector consistency. - // TODO(angiebird): Figure out lambda's proper value. - const int lambda = cpi->tpl_stats[frame_idx].lambda; - int_mv nb_full_mvs[NB_MVS_NUM]; - int nb_full_mv_num; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - - vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - nb_full_mv_num = - vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); - vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, - lambda, 1, nb_full_mvs, nb_full_mv_num, mv); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - return bestsme; -} - -static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, int stride, - BLOCK_SIZE bsize, MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - uint32_t bestsme = UINT_MAX; - uint32_t distortion; - uint32_t sse; - int cost_list[5]; - - MV best_ref_mv1 = { 0, 0 }; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - // TODO(yunqing): may use higher tap interp filter than 2 taps. 
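// Motion vectors above are stored in eighth-pel units, so the full-pel seed
// (best_ref_mv1_full) is derived with an arithmetic right shift by 3, which
// floors rather than truncates. For example:
//   MV mv = { -20, 13 };  /* -2.5 px vertically, +1.625 px horizontally */
//   MV full = { mv.row >> 3, mv.col >> 3 };  /* gives { -3, 1 } */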
- // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step( - x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - USE_2_TAPS); - - return bestsme; -} - -#else // CONFIG_NON_GREEDY_MV -static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, - uint8_t *cur_frame_buf, - uint8_t *ref_frame_buf, - int stride, BLOCK_SIZE bsize, - MV *mv) { - MACROBLOCK *const x = &td->mb; - MACROBLOCKD *const xd = &x->e_mbd; - MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; - const SEARCH_METHODS search_method = NSTEP; - int step_param; - int sadpb = x->sadperbit16; - uint32_t bestsme = UINT_MAX; - uint32_t distortion; - uint32_t sse; - int cost_list[5]; - const MvLimits tmp_mv_limits = x->mv_limits; - - MV best_ref_mv1 = { 0, 0 }; - MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ - - best_ref_mv1_full.col = best_ref_mv1.col >> 3; - best_ref_mv1_full.row = best_ref_mv1.row >> 3; - - // Setup frame pointers - x->plane[0].src.buf = cur_frame_buf; - x->plane[0].src.stride = stride; - xd->plane[0].pre[0].buf = ref_frame_buf; - xd->plane[0].pre[0].stride = stride; - - step_param = mv_sf->reduce_first_step_size; - step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); - - vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); - - vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, - search_method, sadpb, cond_cost_list(cpi, cost_list), - &best_ref_mv1, mv, 0, 0); - - /* restore UMV window */ - x->mv_limits = tmp_mv_limits; - - // TODO(yunqing): may use higher tap interp filter than 2 taps. - // Ignore mv costing by sending NULL pointer instead of cost array - bestsme = cpi->find_fractional_mv_step( - x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, - &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, - cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, - USE_2_TAPS); - - return bestsme; -} -#endif - -static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, - int ref_pos_col, int block, BLOCK_SIZE bsize) { - int width = 0, height = 0; - int bw = 4 << b_width_log2_lookup[bsize]; - int bh = 4 << b_height_log2_lookup[bsize]; - - switch (block) { - case 0: - width = grid_pos_col + bw - ref_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 1: - width = ref_pos_col + bw - grid_pos_col; - height = grid_pos_row + bh - ref_pos_row; - break; - case 2: - width = grid_pos_col + bw - ref_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - case 3: - width = ref_pos_col + bw - grid_pos_col; - height = ref_pos_row + bh - grid_pos_row; - break; - default: assert(0); - } - - return width * height; -} - -static int round_floor(int ref_pos, int bsize_pix) { - int round; - if (ref_pos < 0) - round = -(1 + (-ref_pos - 1) / bsize_pix); - else - round = ref_pos / bsize_pix; - - return round; -} - -static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, - BLOCK_SIZE bsize, int stride) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; - const int64_t 
mc_flow = tpl_ptr->mc_flow; - const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; - *tpl_ptr = *src_stats; - tpl_ptr->mc_flow = mc_flow; - tpl_ptr->mc_ref_cost = mc_ref_cost; - tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; - } - } -} - -static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; - TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; - MV mv = tpl_stats->mv.as_mv; - int mv_row = mv.row >> 3; - int mv_col = mv.col >> 3; - - int ref_pos_row = mi_row * MI_SIZE + mv_row; - int ref_pos_col = mi_col * MI_SIZE + mv_col; - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int pix_num = bw * bh; - - // top-left on grid block location in pixel - int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; - int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; - int block; - - for (block = 0; block < 4; ++block) { - int grid_pos_row = grid_pos_row_base + bh * (block >> 1); - int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); - - if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && - grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { - int overlap_area = get_overlap_area( - grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); - int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; - int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; - - int64_t mc_flow = tpl_stats->mc_dep_cost - - (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / - tpl_stats->intra_cost; - - int idx, idy; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *des_stats = - &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + - (ref_mi_col + idx)]; - - des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; - des_stats->mc_ref_cost += - ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / - pix_num; - assert(overlap_area >= 0); - } - } - } - } -} - -static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, - int mi_row, int mi_col, const BLOCK_SIZE bsize) { - int idx, idy; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - - for (idy = 0; idy < mi_height; ++idy) { - for (idx = 0; idx < mi_width; ++idx) { - TplDepStats *tpl_ptr = - &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; - tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, - BLOCK_8X8); - } - } -} - -static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, - TX_SIZE tx_size, int64_t *recon_error, - int64_t *sse) { - MACROBLOCKD *const xd = &x->e_mbd; - const struct macroblock_plane *const p = &x->plane[plane]; - const struct macroblockd_plane *const pd = &xd->plane[plane]; - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; - uint16_t eob; - int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; - const int shift = tx_size == TX_32X32 ? 0 : 2; - - // skip block condition should be handled before this is called. 
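// A worked example for the overlap accounting in tpl_model_update_b above,
// with a 16x16 block whose motion lands at ref_pos (row 10, col 6): the grid
// base is (round_floor(10, 16) * 16, round_floor(6, 16) * 16) == (0, 0), and
// the four candidate grid blocks receive
//   block 0: width = 0 + 16 - 6 = 10, height = 0 + 16 - 10 = 6   ->  60
//   block 1: width = 6 + 16 - 16 = 6, height = 0 + 16 - 10 = 6   ->  36
//   block 2: width = 0 + 16 - 6 = 10, height = 10 + 16 - 16 = 10 -> 100
//   block 3: width = 6 + 16 - 16 = 6, height = 10 + 16 - 16 = 10 ->  60
// which sums to 256 == pix_num, so the mc_flow share is partitioned exactly.
// Note round_floor() floors toward negative infinity, e.g.
// round_floor(-6, 16) == -1, keeping out-of-frame positions on the correct
// grid block.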
- assert(!x->skip_block); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, - qcoeff, dqcoeff, pd->dequant, &eob, - scan_order->scan, scan_order->iscan); - } else { - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, - scan_order->iscan); - } -#else - vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, &eob, scan_order->scan, - scan_order->iscan); -#endif // CONFIG_VP9_HIGHBITDEPTH - - *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; - *recon_error = VPXMAX(*recon_error, 1); - - *sse = (*sse) >> shift; - *sse = VPXMAX(*sse, 1); -} - -#if CONFIG_VP9_HIGHBITDEPTH -void highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. - switch (tx_size) { - case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -void wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, - TX_SIZE tx_size) { - switch (tx_size) { - case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; - case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; - case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; - default: assert(0); - } -} - -static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, - int mi_col) { - x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); - x->mv_limits.row_max = - (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); - x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); - x->mv_limits.col_max = - ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); -} - -static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, - struct scale_factors *sf, GF_PICTURE *gf_picture, - int frame_idx, TplDepFrame *tpl_frame, - int16_t *src_diff, tran_low_t *coeff, - tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, - int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, - YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, - int64_t *recon_error, int64_t *sse) { - VP9_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - - const int bw = 4 << b_width_log2_lookup[bsize]; - const int bh = 4 << b_height_log2_lookup[bsize]; - const int pix_num = bw * bh; - int best_rf_idx = -1; - int_mv best_mv; - int64_t best_inter_cost = INT64_MAX; - int64_t inter_cost; - int rf_idx; - const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; - - int64_t best_intra_cost = INT64_MAX; - int64_t intra_cost; - PREDICTION_MODE mode; - int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - MODE_INFO mi_above, mi_left; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - TplDepStats *tpl_stats = - &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - - xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); - xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; - xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); - xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; - xd->above_mi = (mi_row > 0) ? 
&mi_above : NULL; - xd->left_mi = (mi_col > 0) ? &mi_left : NULL; - - // Intra prediction search - for (mode = DC_PRED; mode <= TM_PRED; ++mode) { - uint8_t *src, *dst; - int src_stride, dst_stride; - - src = xd->cur_buf->y_buffer + mb_y_offset; - src_stride = xd->cur_buf->y_stride; - - dst = &predictor[0]; - dst_stride = bw; - - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - - vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, - src_stride, dst, dst_stride, 0, 0, 0); - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride, xd->bd); - highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_highbd_satd(coeff, pix_num); - } else { - vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, - dst_stride); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_satd(coeff, pix_num); - } -#else - vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - intra_cost = vpx_satd(coeff, pix_num); -#endif // CONFIG_VP9_HIGHBITDEPTH - - if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; - } - - // Motion compensated prediction - best_mv.as_int = 0; - - set_mv_limits(cm, x, mi_row, mi_col); - - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int_mv mv; -#if CONFIG_NON_GREEDY_MV - MotionField *motion_field; -#endif - if (ref_frame[rf_idx] == NULL) continue; - -#if CONFIG_NON_GREEDY_MV - (void)td; - motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); -#else - motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, bsize, &mv.as_mv); -#endif - -#if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_build_inter_predictor( - CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), - ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, - &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, - mi_row * MI_SIZE, xd->bd); - vpx_highbd_subtract_block( - bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); - highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_highbd_satd(coeff, pix_num); - } else { - vp9_build_inter_predictor( - ref_frame[rf_idx]->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, - 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); - vpx_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd(coeff, pix_num); - } -#else - vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, - ref_frame[rf_idx]->y_stride, &predictor[0], bw, - &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, - mi_col * MI_SIZE, mi_row * MI_SIZE); - vpx_subtract_block(bh, bw, src_diff, bw, - xd->cur_buf->y_buffer + mb_y_offset, - xd->cur_buf->y_stride, &predictor[0], bw); - wht_fwd_txfm(src_diff, bw, coeff, tx_size); - inter_cost = vpx_satd(coeff, pix_num); -#endif - - if (inter_cost < best_inter_cost) { - best_rf_idx = rf_idx; - best_inter_cost = inter_cost; - best_mv.as_int = 
mv.as_int; - get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, - sse); - } - } - best_intra_cost = VPXMAX(best_intra_cost, 1); - best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); - tpl_stats->inter_cost = VPXMAX( - 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->intra_cost = VPXMAX( - 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); - tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; - tpl_stats->mv.as_int = best_mv.as_int; -} - -#if CONFIG_NON_GREEDY_MV -static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, - int frame_idx, int rf_idx, int mi_row, - int mi_col, struct buf_2d *src, - struct buf_2d *pre) { - const int mb_y_offset = - mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - YV12_BUFFER_CONFIG *ref_frame = NULL; - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - ref_frame = gf_picture[ref_frame_idx].frame; - src->buf = xd->cur_buf->y_buffer + mb_y_offset; - src->stride = xd->cur_buf->y_stride; - pre->buf = ref_frame->y_buffer + mb_y_offset; - pre->stride = ref_frame->y_stride; - assert(src->stride == pre->stride); - return 1; - } else { - printf("invalid ref_frame_idx"); - assert(ref_frame_idx != -1); - return 0; - } -} - -#define kMvPreCheckLines 5 -#define kMvPreCheckSize 15 - -#define MV_REF_POS_NUM 3 -POSITION mv_ref_pos[MV_REF_POS_NUM] = { - { -1, 0 }, - { 0, -1 }, - { -1, -1 }, -}; - -static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, - int mi_col) { - return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; -} - -static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - int i; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int_mv nearest_mv, near_mv, invalid_mv; - nearest_mv.as_int = INVALID_MV; - near_mv.as_int = INVALID_MV; - invalid_mv.as_int = INVALID_MV; - for (i = 0; i < MV_REF_POS_NUM; ++i) { - int nb_row = mi_row + mv_ref_pos[i].row * mi_height; - int nb_col = mi_col + mv_ref_pos[i].col * mi_width; - assert(mv_ref_pos[i].row <= 0); - assert(mv_ref_pos[i].col <= 0); - if (nb_row >= 0 && nb_col >= 0) { - if (nearest_mv.as_int == INVALID_MV) { - nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); - } else { - int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); - if (mv.as_int == nearest_mv.as_int) { - continue; - } else { - near_mv = mv; - break; - } - } - } - } - if (nearest_mv.as_int == INVALID_MV) { - nearest_mv.as_mv.row = 0; - nearest_mv.as_mv.col = 0; - } - if (near_mv.as_int == INVALID_MV) { - near_mv.as_mv.row = 0; - near_mv.as_mv.col = 0; - } - if (mv_mode == NEAREST_MV_MODE) { - return nearest_mv; - } - if (mv_mode == NEAR_MV_MODE) { - return near_mv; - } - assert(0); - return invalid_mv; -} - -static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, - MotionField *motion_field, - TplDepFrame *tpl_frame, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - int_mv mv; - switch (mv_mode) { - case ZERO_MV_MODE: - mv.as_mv.row = 0; - mv.as_mv.col = 0; - break; - case NEW_MV_MODE: - mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); - break; - case NEAREST_MV_MODE: - mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); - break; - case NEAR_MV_MODE: - mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); - break; - default: - mv.as_int = 
INVALID_MV; - assert(0); - break; - } - return mv; -} - -static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *mv) { - uint32_t sse; - struct buf_2d src; - struct buf_2d pre; - MV full_mv; - *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, - mi_row, mi_col); - full_mv = get_full_mv(&mv->as_mv); - if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, - &src, &pre)) { - // TODO(angiebird): Consider subpixel when computing the sse. - cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), - pre.stride, &sse); - return (double)(sse << VP9_DIST_SCALE_LOG2); - } else { - assert(0); - return 0; - } -} - -static int get_mv_mode_cost(int mv_mode) { - // TODO(angiebird): The probabilities are roughly inferred from - // default_inter_mode_probs. Check if there is a better way to set the - // probabilities. - const int zero_mv_prob = 16; - const int new_mv_prob = 24 * 1; - const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; - assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); - switch (mv_mode) { - case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; - case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; - case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; - case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; - default: assert(0); return -1; - } -} - -static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { - double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + - log2(1 + abs(new_mv->col - ref_mv->col)); - mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); - return mv_diff_cost; -} -static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, - TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, - int mi_col) { - double mv_cost = get_mv_mode_cost(mv_mode); - if (mv_mode == NEW_MV_MODE) { - MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, - bsize, mi_row, mi_col) - .as_mv; - MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, - tpl_frame, bsize, mi_row, mi_col) - .as_mv; - MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, - bsize, mi_row, mi_col) - .as_mv; - double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); - double near_cost = get_mv_diff_cost(&new_mv, &near_mv); - mv_cost += nearest_cost < near_cost ? 
nearest_cost : near_cost; - } - return mv_cost; -} - -static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - int_mv *mv) { - MACROBLOCKD *xd = &x->e_mbd; - double mv_dist = - get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); - double mv_cost = - get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); - double mult = 180; - - return mv_cost + mult * log2f(1 + mv_dist); -} - -static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, - MotionField *motion_field, int frame_idx, - TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col, - double *rd, int_mv *mv) { - int best_mv_mode = ZERO_MV_MODE; - int update = 0; - int mv_mode; - *rd = 0; - for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { - double this_rd; - int_mv this_mv; - if (mv_mode == NEW_MV_MODE) { - continue; - } - this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); - if (update == 0) { - *rd = this_rd; - *mv = this_mv; - best_mv_mode = mv_mode; - update = 1; - } else { - if (this_rd < *rd) { - *rd = this_rd; - *mv = this_mv; - best_mv_mode = mv_mode; - } - } - } - return best_mv_mode; -} - -static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, MotionField *motion_field, - int frame_idx, TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize, int mi_row, int mi_col) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int tmp_mv_mode_arr[kMvPreCheckSize]; - int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; - double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; - int_mv *select_mv_arr = cpi->select_mv_arr; - int_mv tmp_select_mv_arr[kMvPreCheckSize]; - int stride = tpl_frame->stride; - double new_mv_rd = 0; - double no_new_mv_rd = 0; - double this_new_mv_rd = 0; - double this_no_new_mv_rd = 0; - int idx; - int tmp_idx; - assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); - - // no new mv - // diagonal scan order - tmp_idx = 0; - for (idx = 0; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - double this_rd; - int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; - mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( - cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, - bsize, nb_row, nb_col, &this_rd, mv); - if (r == 0 && c == 0) { - this_no_new_mv_rd = this_rd; - } - no_new_mv_rd += this_rd; - tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; - tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; - ++tmp_idx; - } - } - } - - // new mv - mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; - this_new_mv_rd = eval_mv_mode( - NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, - rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); - new_mv_rd = this_new_mv_rd; - // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE - // beforehand. 
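// The pre-check window in predict_mv_mode above is scanned anti-diagonally:
// for each idx, the pairs (r, c) with r + c == idx are visited, i.e.
//   (0,0); (0,1),(1,0); (0,2),(1,1),(2,0); ...
// With kMvPreCheckLines == 5 that covers 1 + 2 + 3 + 4 + 5 == 15 ==
// kMvPreCheckSize cells, which is exactly what the assert at the top of the
// function checks.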
- for (idx = 1; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - double this_rd; - int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; - mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( - cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, - bsize, nb_row, nb_col, &this_rd, mv); - new_mv_rd += this_rd; - } - } - } - - // update best_mv_mode - tmp_idx = 0; - if (no_new_mv_rd < new_mv_rd) { - for (idx = 0; idx < kMvPreCheckLines; ++idx) { - int r; - for (r = 0; r <= idx; ++r) { - int c = idx - r; - int nb_row = mi_row + r * mi_height; - int nb_col = mi_col + c * mi_width; - if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { - mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; - select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; - ++tmp_idx; - } - } - } - rd_diff_arr[mi_row * stride + mi_col] = 0; - } else { - rd_diff_arr[mi_row * stride + mi_col] = - (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); - } -} - -static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, - GF_PICTURE *gf_picture, - MotionField *motion_field, int frame_idx, - TplDepFrame *tpl_frame, int rf_idx, - BLOCK_SIZE bsize) { - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int unit_rows = tpl_frame->mi_rows / mi_height; - const int unit_cols = tpl_frame->mi_cols / mi_width; - const int max_diagonal_lines = unit_rows + unit_cols - 1; - int idx; - for (idx = 0; idx < max_diagonal_lines; ++idx) { - int r; - for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); - ++r) { - int c = idx - r; - int mi_row = r * mi_height; - int mi_col = c * mi_width; - assert(c >= 0 && c < unit_cols); - assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); - assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); - predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, - rf_idx, bsize, mi_row, mi_col); - } - } -} - -static void do_motion_search(VP9_COMP *cpi, ThreadData *td, - MotionField *motion_field, int frame_idx, - YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, - int mi_row, int mi_col) { - VP9_COMMON *cm = &cpi->common; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = &x->e_mbd; - const int mb_y_offset = - mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; - assert(ref_frame != NULL); - set_mv_limits(cm, x, mi_row, mi_col); - { - int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); - uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; - uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; - const int stride = xd->cur_buf->y_stride; - full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, - ref_frame_buf, stride, bsize, mi_row, mi_col, - &mv.as_mv); - sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, - bsize, &mv.as_mv); - vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); - } -} - -static void build_motion_field( - VP9_COMP *cpi, int frame_idx, - YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { - VP9_COMMON *cm = &cpi->common; - ThreadData *td = &cpi->td; - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - const int 
pw = num_4x4_blocks_wide_lookup[bsize] << 2; - const int ph = num_4x4_blocks_high_lookup[bsize] << 2; - int mi_row, mi_col; - int rf_idx; - - tpl_frame->lambda = (pw * ph) >> 2; - assert(pw * ph == tpl_frame->lambda << 2); - - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - if (ref_frame[rf_idx] == NULL) { - continue; - } - vp9_motion_field_reset_mvs(motion_field); - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], - bsize, mi_row, mi_col); - } - } - } -} -#endif // CONFIG_NON_GREEDY_MV - -static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, - int frame_idx, BLOCK_SIZE bsize) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; - YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; - - VP9_COMMON *cm = &cpi->common; - struct scale_factors sf; - int rdmult, idx; - ThreadData *td = &cpi->td; - MACROBLOCK *x = &td->mb; - MACROBLOCKD *xd = &x->e_mbd; - int mi_row, mi_col; - -#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); - DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); - uint8_t *predictor; -#else - DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); -#endif - DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); - DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); - - const TX_SIZE tx_size = max_txsize_lookup[bsize]; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - int64_t recon_error, sse; -#if CONFIG_NON_GREEDY_MV - int square_block_idx; - int rf_idx; -#endif - - // Setup scaling factor -#if CONFIG_VP9_HIGHBITDEPTH - vp9_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height, - cpi->common.use_highbitdepth); - - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) - predictor = CONVERT_TO_BYTEPTR(predictor16); - else - predictor = predictor8; -#else - vp9_setup_scale_factors_for_frame( - &sf, this_frame->y_crop_width, this_frame->y_crop_height, - this_frame->y_crop_width, this_frame->y_crop_height); -#endif // CONFIG_VP9_HIGHBITDEPTH - - // Prepare reference frame pointers. If any reference frame slot is - // unavailable, the pointer will be set to Null. - for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { - int rf_idx = gf_picture[frame_idx].ref_frame[idx]; - if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; - } - - xd->mi = cm->mi_grid_visible; - xd->mi[0] = cm->mi; - xd->cur_buf = this_frame; - - // Get rd multiplier set up. 
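// In the high-bit-depth branch above, 16-bit pixel buffers are threaded
// through 8-bit pointer interfaces as tagged pointers. A sketch of the
// convention, assuming the CONVERT_TO_* macros from vpx_dsp_common.h:
//   uint16_t buf16[32 * 32 * 3];
//   uint8_t *tagged = CONVERT_TO_BYTEPTR(buf16);  /* pass to 8-bit APIs */
//   uint16_t *pixels = CONVERT_TO_SHORTPTR(tagged);  /* recover 16-bit view */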
- rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); - set_error_per_bit(&cpi->td.mb, rdmult); - vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); - - tpl_frame->is_valid = 1; - - cm->base_qindex = tpl_frame->base_qindex; - vp9_frame_init_quantizer(cpi); - -#if CONFIG_NON_GREEDY_MV - for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; - ++square_block_idx) { - BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); - build_motion_field(cpi, frame_idx, ref_frame, square_bsize); - } - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - MotionField *motion_field = vp9_motion_field_info_get_motion_field( - &cpi->motion_field_info, frame_idx, rf_idx, bsize); - predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, - tpl_frame, rf_idx, bsize); - } - } -#endif - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, - src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, - tx_size, ref_frame, predictor, &recon_error, &sse); - // Motion flow dependency dispenser. - tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, - tpl_frame->stride); - - tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, - bsize); - } - } -} - -#if CONFIG_NON_GREEDY_MV -#define DUMP_TPL_STATS 0 -#if DUMP_TPL_STATS -static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { - int i, j; - printf("%d %d\n", h, w); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; ++j) { - printf("%d ", buf[(row + i) * stride + col + j]); - } - } - printf("\n"); -} - -static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { - dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, - frame_buf->y_width); - dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, - frame_buf->uv_height, frame_buf->uv_width); - dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, - frame_buf->uv_height, frame_buf->uv_width); -} - -static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, - const GF_GROUP *gf_group, - const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { - int frame_idx; - const VP9_COMMON *cm = &cpi->common; - int rf_idx; - for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - int mi_row, mi_col; - int ref_frame_idx; - const int mi_height = num_8x8_blocks_high_lookup[bsize]; - const int mi_width = num_8x8_blocks_wide_lookup[bsize]; - ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; - if (ref_frame_idx != -1) { - YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; - const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; - const int ref_gf_frame_offset = - gf_group->frame_gop_index[ref_frame_idx]; - printf("=\n"); - printf( - "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " - "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", - frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, - ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { - int_mv mv = 
vp9_motion_field_info_get_mv(&cpi->motion_field_info, - frame_idx, rf_idx, bsize, - mi_row, mi_col); - printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, - mv.as_mv.col); - } - } - } - for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { - for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { - if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { - const TplDepStats *tpl_ptr = - &tpl_frame - ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; - printf("%f ", tpl_ptr->feature_score); - } - } - } - printf("\n"); - - for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { - for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { - const int mv_mode = - tpl_frame - ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; - printf("%d ", mv_mode); - } - } - printf("\n"); - - dump_frame_buf(gf_picture[frame_idx].frame); - dump_frame_buf(ref_frame_buf); - } - } - } -} -#endif // DUMP_TPL_STATS -#endif // CONFIG_NON_GREEDY_MV - -static void init_tpl_buffer(VP9_COMP *cpi) { - VP9_COMMON *cm = &cpi->common; - int frame; - - const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); - const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); -#if CONFIG_NON_GREEDY_MV - int rf_idx; - - vpx_free(cpi->select_mv_arr); - CHECK_MEM_ERROR( - cm, cpi->select_mv_arr, - vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); -#endif - - // TODO(jingning): Reduce the actual memory use for tpl model build up. - for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { - if (cpi->tpl_stats[frame].width >= mi_cols && - cpi->tpl_stats[frame].height >= mi_rows && - cpi->tpl_stats[frame].tpl_stats_ptr) - continue; - -#if CONFIG_NON_GREEDY_MV - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); - CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].mv_mode_arr[rf_idx], - vpx_calloc(mi_rows * mi_cols * 4, - sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx]))); - vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); - CHECK_MEM_ERROR( - cm, cpi->tpl_stats[frame].rd_diff_arr[rf_idx], - vpx_calloc(mi_rows * mi_cols * 4, - sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx]))); - } -#endif - vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - CHECK_MEM_ERROR(cm, cpi->tpl_stats[frame].tpl_stats_ptr, - vpx_calloc(mi_rows * mi_cols, - sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr))); - cpi->tpl_stats[frame].is_valid = 0; - cpi->tpl_stats[frame].width = mi_cols; - cpi->tpl_stats[frame].height = mi_rows; - cpi->tpl_stats[frame].stride = mi_cols; - cpi->tpl_stats[frame].mi_rows = cm->mi_rows; - cpi->tpl_stats[frame].mi_cols = cm->mi_cols; - } - - for (frame = 0; frame < REF_FRAMES; ++frame) { - cpi->enc_frame_buf[frame].mem_valid = 0; - cpi->enc_frame_buf[frame].released = 1; - } -} - -static void free_tpl_buffer(VP9_COMP *cpi) { - int frame; -#if CONFIG_NON_GREEDY_MV - vp9_free_motion_field_info(&cpi->motion_field_info); - vpx_free(cpi->select_mv_arr); -#endif - for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) { -#if CONFIG_NON_GREEDY_MV - int rf_idx; - for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { - vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]); - vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]); - } -#endif - vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr); - cpi->tpl_stats[frame].is_valid = 0; - } -} - -#if CONFIG_RATE_CTRL -static void accumulate_frame_tpl_stats(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int show_frame_count = 0; - int frame_idx; - // Accumulate tpl 
stats for each frame in the current group of picture. - for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { - TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; - TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; - const int tpl_stride = tpl_frame->stride; - int64_t intra_cost_base = 0; - int64_t inter_cost_base = 0; - int64_t mc_dep_cost_base = 0; - int64_t mc_ref_cost_base = 0; - int64_t mc_flow_base = 0; - int row, col; - - if (!tpl_frame->is_valid) continue; - - for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { - for (col = 0; col < cm->mi_cols; ++col) { - TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; - intra_cost_base += this_stats->intra_cost; - inter_cost_base += this_stats->inter_cost; - mc_dep_cost_base += this_stats->mc_dep_cost; - mc_ref_cost_base += this_stats->mc_ref_cost; - mc_flow_base += this_stats->mc_flow; - } - } - - cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; - cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; - cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; - - ++show_frame_count; - } -} -#endif // CONFIG_RATE_CTRL - -static void setup_tpl_stats(VP9_COMP *cpi) { - GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; - const GF_GROUP *gf_group = &cpi->twopass.gf_group; - int tpl_group_frames = 0; - int frame_idx; - cpi->tpl_bsize = BLOCK_32X32; - - init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); - - init_tpl_stats(cpi); - - // Backward propagation from tpl_group_frames to 1. - for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { - if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; - mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); - } -#if CONFIG_NON_GREEDY_MV - cpi->tpl_ready = 1; -#if DUMP_TPL_STATS - dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); -#endif // DUMP_TPL_STATS -#endif // CONFIG_NON_GREEDY_MV - -#if CONFIG_RATE_CTRL - if (cpi->oxcf.use_simple_encode_api) { - accumulate_frame_tpl_stats(cpi); - } -#endif // CONFIG_RATE_CTRL -} - void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int *ref_frame_coding_indexes, @@ -7663,6 +6373,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, const int gf_group_index = cpi->twopass.gf_group.index; int i; +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) start_timing(cpi, vp9_get_compressed_data_time); +#endif + if (is_one_pass_svc(cpi)) { vp9_one_pass_svc_start_layer(cpi); } @@ -7727,9 +6441,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, int not_last_frame = (cpi->lookahead->sz - arf_src_index > 1); not_last_frame |= ALT_REF_AQ_APPLY_TO_LAST_FRAME; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_temporal_filter_time); +#endif // Produce the filtered ARF frame. 
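// (vp9_temporal_filter() below blends frames from the lookahead window around
// arf_src_index into cpi->alt_ref_buffer, whose borders are then extended for
// subsequent motion search.)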
vp9_temporal_filter(cpi, arf_src_index); vpx_extend_frame_borders(&cpi->alt_ref_buffer); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_temporal_filter_time); +#endif // for small bitrates segmentation overhead usually // eats all bitrate gain from enabling delta quantizers @@ -7843,7 +6563,13 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #if !CONFIG_REALTIME_ONLY if ((oxcf->pass == 2) && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif vp9_rc_get_second_pass_params(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, vp9_rc_get_second_pass_params_time); +#endif } else if (oxcf->pass == 1) { set_frame_size(cpi); } @@ -7864,7 +6590,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, pthread_mutex_init(&cpi->kmeans_mutex, NULL); #endif CHECK_MEM_ERROR( - cm, cpi->kmeans_data_arr, + &cm->error, cpi->kmeans_data_arr, vpx_calloc(mi_rows * mi_cols, sizeof(*cpi->kmeans_data_arr))); cpi->kmeans_data_stride = mi_cols; cpi->kmeans_data_arr_alloc = 1; @@ -7883,13 +6609,19 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, } #endif // CONFIG_NON_GREEDY_MV +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, setup_tpl_stats_time); +#endif if (gf_group_index == 1 && cpi->twopass.gf_group.update_type[gf_group_index] == ARF_UPDATE && cpi->sf.enable_tpl_model) { - init_tpl_buffer(cpi); + vp9_init_tpl_buffer(cpi); vp9_estimate_qp_gop(cpi); - setup_tpl_stats(cpi); + vp9_setup_tpl_stats(cpi); } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, setup_tpl_stats_time); +#endif #if CONFIG_BITSTREAM_DEBUG assert(cpi->oxcf.max_threads == 0 && @@ -7926,8 +6658,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->td.mb.inv_txfm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add; vp9_first_pass(cpi, source); } else if (oxcf->pass == 2 && !cpi->use_svc) { +#if CONFIG_COLLECT_COMPONENT_TIMING + // Accumulate 2nd pass time in 2-pass case. + start_timing(cpi, Pass2Encode_time); +#endif Pass2Encode(cpi, size, dest, frame_flags, encode_frame_result); vp9_twopass_postencode_update(cpi); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, Pass2Encode_time); +#endif } else if (cpi->use_svc) { SvcEncode(cpi, size, dest, frame_flags); } else { @@ -8130,6 +6869,41 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, #endif +#if CONFIG_COLLECT_COMPONENT_TIMING + if (oxcf->pass == 2) end_timing(cpi, vp9_get_compressed_data_time); + + // Print out timing information. + // Note: Use "cpi->frame_component_time[0] > 100 us" to avoid showing of + // show_existing_frame and lag-in-frames. + // if (cpi->frame_component_time[0] > 100) + if (oxcf->pass == 2) { + uint64_t frame_total = 0, total = 0; + int i; + + fprintf(stderr, + "\n Frame number: %d, Frame type: %s, Show Frame: %d, Q: %d\n", + cm->current_video_frame, get_frame_type_enum(cm->frame_type), + cm->show_frame, cm->base_qindex); + for (i = 0; i < kTimingComponents; i++) { + cpi->component_time[i] += cpi->frame_component_time[i]; + // Use vp9_get_compressed_data_time (i = 0) as the total time. 
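// (The timed components are nested -- e.g. the recode loop contains the
// partition and mode searches -- so the percentages printed below are each a
// share of the vp9_get_compressed_data_time total and will not generally sum
// to 100%.)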
+ if (i == 0) { + frame_total = cpi->frame_component_time[0]; + total = cpi->component_time[0]; + } + fprintf(stderr, + " %50s: %15" PRId64 " us [%6.2f%%] (total: %15" PRId64 + " us [%6.2f%%])\n", + get_component_name(i), cpi->frame_component_time[i], + (float)((float)cpi->frame_component_time[i] * 100.0 / + (float)frame_total), + cpi->component_time[i], + (float)((float)cpi->component_time[i] * 100.0 / (float)total)); + cpi->frame_component_time[i] = 0; + } + } +#endif + if (is_one_pass_svc(cpi)) { if (cm->show_frame) { ++cpi->svc.spatial_layer_to_encode; @@ -8172,12 +6946,12 @@ int vp9_get_preview_raw_frame(VP9_COMP *cpi, YV12_BUFFER_CONFIG *dest, } } -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode) { +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode) { VP9_COMMON *cm = &cpi->common; int hr = 0, hs = 0, vr = 0, vs = 0; - if (horiz_mode > ONETWO || vert_mode > ONETWO) return -1; + if (horiz_mode > VP8E_ONETWO || vert_mode > VP8E_ONETWO) return -1; Scale2Ratio(horiz_mode, &hr, &hs); Scale2Ratio(vert_mode, &vr, &vs); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index cca8b53f8..7136f7faa 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -14,9 +14,11 @@ #include <stdio.h> #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_tpl.h" #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif @@ -91,13 +93,6 @@ typedef enum { } ENCODE_BREAKOUT_TYPE; typedef enum { - NORMAL = 0, - FOURFIVE = 1, - THREEFIVE = 2, - ONETWO = 3 -} VPX_SCALING; - -typedef enum { // Good Quality Fast Encoding. The encoder balances quality with the amount of // time it takes to encode the output. Speed setting controls how fast. GOOD, @@ -336,15 +331,14 @@ typedef struct TplDepFrame { typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int thresh_freq_fact_prev[BLOCK_SIZES][MAX_MODES]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int8_t mode_map[BLOCK_SIZES][MAX_MODES]; FIRSTPASS_DATA fp_data; VP9RowMTSync row_mt_sync; // Used for adaptive_rd_thresh with row multithreading int *row_base_thresh_freq_fact; + MV firstpass_top_mv; } TileDataEnc; typedef struct RowMTInfo { @@ -513,6 +507,7 @@ typedef struct EncFrameBuf { } EncFrameBuf; // Maximum operating frame buffer size needed for a GOP using ARF reference. +// This is used to allocate the memory for TPL stats for a GOP. #define MAX_ARF_GOP_SIZE (2 * MAX_LAG_BUFFERS) #define MAX_KMEANS_GROUPS 8 @@ -659,6 +654,72 @@ static INLINE int get_num_unit_4x4(int size) { return (size + 3) >> 2; } static INLINE int get_num_unit_16x16(int size) { return (size + 15) >> 4; } #endif // CONFIG_RATE_CTRL +#if CONFIG_COLLECT_COMPONENT_TIMING +#include "vpx_ports/vpx_timer.h" +// Adjust the following to add new components. 
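// To add a new component: append an enum entry before kTimingComponents, add
// the matching string to get_component_name(), and bracket the code of
// interest with the timer macros, e.g. (my_new_component_time is a
// hypothetical entry):
//   start_timing(cpi, my_new_component_time);
//   /* ... code under measurement ... */
//   end_timing(cpi, my_new_component_time);
// A plausible shape for the two macros, assuming the vpx_usec_timer API from
// vpx_ports/vpx_timer.h (the real definitions may differ):
//   #define start_timing(cpi, component) \
//     vpx_usec_timer_start(&(cpi)->component_timer[component])
//   #define end_timing(cpi, component)                                  \
//     do {                                                              \
//       vpx_usec_timer_mark(&(cpi)->component_timer[component]);        \
//       (cpi)->frame_component_time[component] +=                       \
//           vpx_usec_timer_elapsed(&(cpi)->component_timer[component]); \
//     } while (0)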
+typedef enum { + vp9_get_compressed_data_time, + vp9_temporal_filter_time, + vp9_rc_get_second_pass_params_time, + setup_tpl_stats_time, + Pass2Encode_time, + + encode_with_recode_loop_time, + loopfilter_frame_time, + vp9_pack_bitstream_time, + + encode_frame_internal_time, + rd_pick_partition_time, + rd_pick_sb_modes_time, + encode_sb_time, + + vp9_rd_pick_inter_mode_sb_time, + vp9_rd_pick_inter_mode_sub8x8_time, + + intra_mode_search_time, + handle_inter_mode_time, + single_motion_search_time, + joint_motion_search_time, + interp_filter_time, + + kTimingComponents, +} TIMING_COMPONENT; + +static INLINE char const *get_component_name(int index) { + switch (index) { + case vp9_get_compressed_data_time: return "vp9_get_compressed_data_time"; + case vp9_temporal_filter_time: return "vp9_temporal_filter_time"; + case vp9_rc_get_second_pass_params_time: + return "vp9_rc_get_second_pass_params_time"; + case setup_tpl_stats_time: return "setup_tpl_stats_time"; + case Pass2Encode_time: return "Pass2Encode_time"; + + case encode_with_recode_loop_time: return "encode_with_recode_loop_time"; + case loopfilter_frame_time: return "loopfilter_frame_time"; + case vp9_pack_bitstream_time: return "vp9_pack_bitstream_time"; + + case encode_frame_internal_time: return "encode_frame_internal_time"; + case rd_pick_partition_time: return "rd_pick_partition_time"; + case rd_pick_sb_modes_time: return "rd_pick_sb_modes_time"; + case encode_sb_time: return "encode_sb_time"; + + case vp9_rd_pick_inter_mode_sb_time: + return "vp9_rd_pick_inter_mode_sb_time"; + case vp9_rd_pick_inter_mode_sub8x8_time: + return "vp9_rd_pick_inter_mode_sub8x8_time"; + + case intra_mode_search_time: return "intra_mode_search_time"; + case handle_inter_mode_time: return "handle_inter_mode_time"; + case single_motion_search_time: return "single_motion_search_time"; + case joint_motion_search_time: return "joint_motion_search_time"; + case interp_filter_time: return "interp_filter_time"; + + default: assert(0); + } + return "error"; +} +#endif + typedef struct VP9_COMP { FRAME_INFO frame_info; QUANTS quants; @@ -685,6 +746,8 @@ typedef struct VP9_COMP { BLOCK_SIZE tpl_bsize; TplDepFrame tpl_stats[MAX_ARF_GOP_SIZE]; + // Used to store TPL stats before propagation + VpxTplGopStats tpl_gop_stats; YV12_BUFFER_CONFIG *tpl_recon_frames[REF_FRAMES]; EncFrameBuf enc_frame_buf[REF_FRAMES]; #if CONFIG_MULTITHREAD @@ -784,7 +847,7 @@ typedef struct VP9_COMP { uint8_t *skin_map; - // segment threashold for encode breakout + // segment threshold for encode breakout int segment_encode_breakout[MAX_SEGMENTS]; CYCLIC_REFRESH *cyclic_refresh; @@ -858,12 +921,15 @@ typedef struct VP9_COMP { // number of MBs in the current frame when the frame is // scaled. + int last_coded_width; + int last_coded_height; + int use_svc; SVC svc; // Store frame variance info in SOURCE_VAR_BASED_PARTITION search type. - diff *source_diff_var; + Diff *source_diff_var; // The threshold used in SOURCE_VAR_BASED_PARTITION search type. unsigned int source_var_thresh; int frames_till_next_var_check; @@ -973,6 +1039,29 @@ typedef struct VP9_COMP { EXT_RATECTRL ext_ratectrl; int fixed_qp_onepass; + + // Flag to keep track of dynamic change in deadline mode + // (good/best/realtime). + MODE deadline_mode_previous_frame; + + // Flag to disable scene detection when rtc rate control library is used. + int disable_scene_detection_rtc_ratectrl; + +#if CONFIG_COLLECT_COMPONENT_TIMING + /*! + * component_time[] are initialized to zero while encoder starts. 
+ */ + uint64_t component_time[kTimingComponents]; + /*! + * Stores timing for individual components between calls of start_timing() + * and end_timing(). + */ + struct vpx_usec_timer component_timer[kTimingComponents]; + /*! + * frame_component_time[] are initialized to zero at beginning of each frame. + */ + uint64_t frame_component_time[kTimingComponents]; +#endif } VP9_COMP; #if CONFIG_RATE_CTRL @@ -983,7 +1072,7 @@ static INLINE void partition_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->partition_info, + CHECK_MEM_ERROR(&cm->error, cpi->partition_info, (PARTITION_INFO *)vpx_calloc(unit_width * unit_height, sizeof(PARTITION_INFO))); memset(cpi->partition_info, 0, @@ -998,8 +1087,8 @@ static INLINE void free_partition_info(struct VP9_COMP *cpi) { } static INLINE void reset_mv_info(MOTION_VECTOR_INFO *mv_info) { - mv_info->ref_frame[0] = NONE; - mv_info->ref_frame[1] = NONE; + mv_info->ref_frame[0] = NO_REF_FRAME; + mv_info->ref_frame[1] = NO_REF_FRAME; mv_info->mv[0].as_int = INVALID_MV; mv_info->mv[1].as_int = INVALID_MV; } @@ -1011,7 +1100,7 @@ static INLINE void motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_4x4(cpi->frame_info.frame_width); const int unit_height = get_num_unit_4x4(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->motion_vector_info, + CHECK_MEM_ERROR(&cm->error, cpi->motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); memset(cpi->motion_vector_info, 0, @@ -1030,7 +1119,7 @@ static INLINE void free_motion_vector_info(struct VP9_COMP *cpi) { static INLINE void tpl_stats_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; CHECK_MEM_ERROR( - cm, cpi->tpl_stats_info, + &cm->error, cpi->tpl_stats_info, (TplDepStats *)vpx_calloc(MAX_LAG_BUFFERS, sizeof(TplDepStats))); memset(cpi->tpl_stats_info, 0, MAX_LAG_BUFFERS * sizeof(TplDepStats)); } @@ -1049,7 +1138,7 @@ static INLINE void fp_motion_vector_info_init(struct VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int unit_width = get_num_unit_16x16(cpi->frame_info.frame_width); const int unit_height = get_num_unit_16x16(cpi->frame_info.frame_height); - CHECK_MEM_ERROR(cm, cpi->fp_motion_vector_info, + CHECK_MEM_ERROR(&cm->error, cpi->fp_motion_vector_info, (MOTION_VECTOR_INFO *)vpx_calloc(unit_width * unit_height, sizeof(MOTION_VECTOR_INFO))); } @@ -1154,8 +1243,8 @@ int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, int cols); -int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, - VPX_SCALING vert_mode); +int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING_MODE horiz_mode, + VPX_SCALING_MODE vert_mode); int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width, unsigned int height); @@ -1296,6 +1385,14 @@ void vp9_get_ref_frame_info(FRAME_UPDATE_TYPE update_type, int ref_frame_flags, void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst, int bd); +#else +void vp9_scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + 
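// The CHECK_MEM_ERROR call sites above now receive &cm->error instead of cm
// itself. A minimal model of the post-change macro (sketch; the real one in
// vpx/internal/vpx_codec_internal.h may differ in detail):
//   #define CHECK_MEM_ERROR(error, lval, expr)                \
//     do {                                                    \
//       (lval) = (expr);                                      \
//       if (!(lval))                                          \
//         vpx_internal_error(error, VPX_CODEC_MEM_ERROR,      \
//                            "Failed to allocate " #lval);    \
//     } while (0)
// Passing the vpx_internal_error_info directly decouples the macro from
// VP9_COMMON.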
YV12_BUFFER_CONFIG *vp9_svc_twostage_scale( VP9_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, YV12_BUFFER_CONFIG *scaled_temp, INTERP_FILTER filter_type, @@ -1380,9 +1477,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width, VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); -int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows, - unsigned int cols, int delta_q[8], int delta_lf[8], - int skip[8], int ref_frame[8]); +vpx_codec_err_t vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, + unsigned int rows, unsigned int cols, + int delta_q[8], int delta_lf[8], int skip[8], + int ref_frame[8]); void vp9_new_framerate(VP9_COMP *cpi, double framerate); @@ -1392,6 +1490,171 @@ int vp9_get_psnr(const VP9_COMP *cpi, PSNR_STATS *psnr); #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl)) +static INLINE void alloc_frame_mvs(VP9_COMMON *const cm, int buffer_idx) { + RefCntBuffer *const new_fb_ptr = &cm->buffer_pool->frame_bufs[buffer_idx]; + if (new_fb_ptr->mvs == NULL || new_fb_ptr->mi_rows < cm->mi_rows || + new_fb_ptr->mi_cols < cm->mi_cols) { + vpx_free(new_fb_ptr->mvs); + CHECK_MEM_ERROR(&cm->error, new_fb_ptr->mvs, + (MV_REF *)vpx_calloc(cm->mi_rows * cm->mi_cols, + sizeof(*new_fb_ptr->mvs))); + new_fb_ptr->mi_rows = cm->mi_rows; + new_fb_ptr->mi_cols = cm->mi_cols; + } +} + +static INLINE int mv_cost(const MV *mv, const int *joint_cost, + int *const comp_cost[2]) { + assert(mv->row >= -MV_MAX && mv->row < MV_MAX); + assert(mv->col >= -MV_MAX && mv->col < MV_MAX); + return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + + comp_cost[1][mv->col]; +} + +static INLINE int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, + const MV *ref, int sad_per_bit) { + MV diff; + diff.row = mv->row - ref->row; + diff.col = mv->col - ref->col; + return ROUND_POWER_OF_TWO( + (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, + VP9_PROB_COST_SHIFT); +} + +static INLINE uint32_t get_start_mv_sad(const MACROBLOCK *x, const MV *mvp_full, + const MV *ref_mv_full, + vpx_sad_fn_t sad_fn_ptr, int sadpb) { + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + const uint8_t *const pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + uint32_t start_mv_sad = + sad_fn_ptr(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad += mvsad_err_cost(x, mvp_full, ref_mv_full, sadpb); + + return start_mv_sad; +} + +static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, + int subsampling_dim, int blk_dim) { + return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; +} + +// Compute the sum of squares on all visible 4x4s in the transform block. 
+static int64_t sum_squares_visible(const MACROBLOCKD *xd, + const struct macroblockd_plane *const pd, + const int16_t *diff, const int diff_stride, + int blk_row, int blk_col, + const BLOCK_SIZE plane_bsize, + const BLOCK_SIZE tx_bsize, + int *visible_width, int *visible_height) { + int64_t sse; + const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; + const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; + const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; + const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; + const int b4x4s_to_right_edge = num_4x4_to_edge( + plane_4x4_w, xd->mb_to_right_edge, pd->subsampling_x, blk_col); + const int b4x4s_to_bottom_edge = num_4x4_to_edge( + plane_4x4_h, xd->mb_to_bottom_edge, pd->subsampling_y, blk_row); + if (tx_bsize == BLOCK_4X4 || + (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { + assert(tx_4x4_w == tx_4x4_h); + sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); + *visible_width = tx_4x4_w << 2; + *visible_height = tx_4x4_h << 2; + } else { + int r, c; + const int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); + const int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); + sse = 0; + // if we are in the unrestricted motion border. + for (r = 0; r < max_r; ++r) { + // Skip visiting the sub blocks that are wholly within the UMV. + for (c = 0; c < max_c; ++c) { + sse += (int64_t)vpx_sum_squares_2d_i16( + diff + r * diff_stride * 4 + c * 4, diff_stride, 4); + } + } + *visible_width = max_c << 2; + *visible_height = max_r << 2; + } + return sse; +} + +// Check if trellis coefficient optimization of the transform block is enabled. +static INLINE int do_trellis_opt(const struct macroblockd_plane *pd, + const int16_t *src_diff, int diff_stride, + int blk_row, int blk_col, + BLOCK_SIZE plane_bsize, TX_SIZE tx_size, + void *arg) { + const struct encode_b_args *const args = (struct encode_b_args *)arg; + const MACROBLOCK *const x = args->x; + + switch (args->enable_trellis_opt) { + case DISABLE_TRELLIS_OPT: return 0; + case ENABLE_TRELLIS_OPT: return 1; + case ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR: { + vpx_clear_system_state(); + + return (args->trellis_opt_thresh > 0.0) + ? (x->log_block_src_var <= args->trellis_opt_thresh) + : 1; + } + case ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE: { + const MACROBLOCKD *const xd = &x->e_mbd; + const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; +#if CONFIG_VP9_HIGHBITDEPTH + const int dequant_shift = + (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 
xd->bd - 5 : 3; +#else + const int dequant_shift = 3; +#endif // CONFIG_VP9_HIGHBITDEPTH + const int qstep = pd->dequant[1] >> dequant_shift; + int *sse_calc_done = args->sse_calc_done; + int64_t *sse = args->sse; + int visible_width = 0, visible_height = 0; + + // TODO: Enable the sf for high bit-depth case + if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) || !sse || + !sse_calc_done) + return 1; + + *sse = sum_squares_visible(xd, pd, src_diff, diff_stride, blk_row, + blk_col, plane_bsize, tx_bsize, &visible_width, + &visible_height); + *sse_calc_done = 1; + + vpx_clear_system_state(); + + return (*(sse) <= (int64_t)visible_width * visible_height * qstep * + qstep * args->trellis_opt_thresh); + } + default: assert(0 && "Invalid trellis optimization method."); return 1; + } +} + +#if CONFIG_COLLECT_COMPONENT_TIMING +static INLINE void start_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_start(&cpi->component_timer[component]); +} +static INLINE void end_timing(VP9_COMP *cpi, int component) { + vpx_usec_timer_mark(&cpi->component_timer[component]); + cpi->frame_component_time[component] += + vpx_usec_timer_elapsed(&cpi->component_timer[component]); +} +static INLINE char const *get_frame_type_enum(int type) { + switch (type) { + case 0: return "KEY_FRAME"; + case 1: return "INTER_FRAME"; + default: assert(0); + } + return "error"; +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 453fe2e0d..681996d33 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -94,10 +94,10 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { vp9_bitstream_encode_tiles_buffer_dealloc(cpi); vp9_encode_free_mt_data(cpi); - CHECK_MEM_ERROR(cm, cpi->workers, + CHECK_MEM_ERROR(&cm->error, cpi->workers, vpx_malloc(num_workers * sizeof(*cpi->workers))); - CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + CHECK_MEM_ERROR(&cm->error, cpi->tile_thr_data, vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); for (i = 0; i < num_workers; i++) { @@ -111,7 +111,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { thread_data->cpi = cpi; // Allocate thread data. - CHECK_MEM_ERROR(cm, thread_data->td, + CHECK_MEM_ERROR(&cm->error, thread_data->td, vpx_memalign(32, sizeof(*thread_data->td))); vp9_zero(*thread_data->td); @@ -121,7 +121,7 @@ static void create_enc_workers(VP9_COMP *cpi, int num_workers) { vp9_setup_pc_tree(cm, thread_data->td); // Allocate frame counters in thread data. 
- CHECK_MEM_ERROR(cm, thread_data->td->counts, + CHECK_MEM_ERROR(&cm->error, thread_data->td->counts, vpx_calloc(1, sizeof(*thread_data->td->counts))); // Create threads @@ -265,6 +265,7 @@ static void accumulate_fp_tile_stat(TileDataEnc *tile_data, tile_data->fp_data.intra_count_high += tile_data_t->fp_data.intra_count_high; tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.new_mv_count += tile_data_t->fp_data.new_mv_count; tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; @@ -292,7 +293,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, { int i; - CHECK_MEM_ERROR(cm, row_mt_sync->mutex, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->mutex, vpx_malloc(sizeof(*row_mt_sync->mutex) * rows)); if (row_mt_sync->mutex) { for (i = 0; i < rows; ++i) { @@ -300,7 +301,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, } } - CHECK_MEM_ERROR(cm, row_mt_sync->cond, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cond, vpx_malloc(sizeof(*row_mt_sync->cond) * rows)); if (row_mt_sync->cond) { for (i = 0; i < rows; ++i) { @@ -310,7 +311,7 @@ void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, } #endif // CONFIG_MULTITHREAD - CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + CHECK_MEM_ERROR(&cm->error, row_mt_sync->cur_col, vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); // Set up nsync. diff --git a/vp9/encoder/vp9_ext_ratectrl.c b/vp9/encoder/vp9_ext_ratectrl.c index 1d440442b..4664e8c5e 100644 --- a/vp9/encoder/vp9_ext_ratectrl.c +++ b/vp9/encoder/vp9_ext_ratectrl.c @@ -8,10 +8,15 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <stddef.h> + #include "vp9/encoder/vp9_ext_ratectrl.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/common/vp9_common.h" #include "vpx_dsp/psnr.h" +#include "vpx/vpx_codec.h" +#include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" vpx_codec_err_t vp9_extrc_init(EXT_RATECTRL *ext_ratectrl) { if (ext_ratectrl == NULL) { @@ -92,6 +97,7 @@ static void gen_rc_firstpass_stats(const FIRSTPASS_STATS *stats, rc_frame_stats->mv_in_out_count = stats->mv_in_out_count; rc_frame_stats->duration = stats->duration; rc_frame_stats->count = stats->count; + rc_frame_stats->new_mv_count = stats->new_mv_count; } vpx_codec_err_t vp9_extrc_send_firstpass_stats( @@ -118,6 +124,21 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats( return VPX_CODEC_OK; } +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats) { + if (ext_ratectrl == NULL) { + return VPX_CODEC_INVALID_PARAM; + } + if (ext_ratectrl->ready && ext_ratectrl->funcs.send_tpl_gop_stats != NULL) { + vpx_rc_status_t rc_status = ext_ratectrl->funcs.send_tpl_gop_stats( + ext_ratectrl->model, tpl_gop_stats); + if (rc_status == VPX_RC_ERROR) { + return VPX_CODEC_ERROR; + } + } + return VPX_CODEC_OK; +} + static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { // TODO(angiebird): Add unit test to make sure this function behaves like // get_frame_type_from_update_type() @@ -131,7 +152,6 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) { default: fprintf(stderr, "Unsupported update_type %d\n", update_type); abort(); - return 1; } } diff --git a/vp9/encoder/vp9_ext_ratectrl.h b/vp9/encoder/vp9_ext_ratectrl.h index 7c3875883..b04580c1d 100644 --- a/vp9/encoder/vp9_ext_ratectrl.h +++ b/vp9/encoder/vp9_ext_ratectrl.h @@ -12,6 +12,7 @@ #define VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_ #include "vpx/vpx_ext_ratectrl.h" +#include "vpx/vpx_tpl.h" #include "vp9/encoder/vp9_firstpass.h" typedef struct EXT_RATECTRL { @@ -34,6 +35,9 @@ vpx_codec_err_t vp9_extrc_delete(EXT_RATECTRL *ext_ratectrl); vpx_codec_err_t vp9_extrc_send_firstpass_stats( EXT_RATECTRL *ext_ratectrl, const FIRST_PASS_INFO *first_pass_info); +vpx_codec_err_t vp9_extrc_send_tpl_stats(EXT_RATECTRL *ext_ratectrl, + const VpxTplGopStats *tpl_gop_stats); + vpx_codec_err_t vp9_extrc_get_encodeframe_decision( EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index, FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref, diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index e9250e25c..a9cdf5353 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -152,6 +152,7 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->pcnt_intra_high = 0.0; section->inactive_zone_rows = 0.0; section->inactive_zone_cols = 0.0; + section->new_mv_count = 0.0; section->MVr = 0.0; section->mvr_abs = 0.0; section->MVc = 0.0; @@ -183,6 +184,7 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->pcnt_intra_high += frame->pcnt_intra_high; section->inactive_zone_rows += frame->inactive_zone_rows; section->inactive_zone_cols += frame->inactive_zone_cols; + section->new_mv_count += frame->new_mv_count; section->MVr += frame->MVr; section->mvr_abs += frame->mvr_abs; section->MVc += frame->MVc; @@ -212,6 +214,7 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->pcnt_intra_high -= frame->pcnt_intra_high; section->inactive_zone_rows -= frame->inactive_zone_rows; section->inactive_zone_cols -= frame->inactive_zone_cols; + section->new_mv_count -= 
frame->new_mv_count; section->MVr -= frame->MVr; section->mvr_abs -= frame->mvr_abs; section->MVc -= frame->MVc; @@ -361,7 +364,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_8_mse8x16; default: return vpx_highbd_8_mse16x16; } - break; case 10: switch (bsize) { case BLOCK_8X8: return vpx_highbd_10_mse8x8; @@ -369,7 +371,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_10_mse8x16; default: return vpx_highbd_10_mse16x16; } - break; case 12: switch (bsize) { case BLOCK_8X8: return vpx_highbd_12_mse8x8; @@ -377,7 +378,6 @@ static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize, case BLOCK_8X16: return vpx_highbd_12_mse8x16; default: return vpx_highbd_12_mse16x16; } - break; } } @@ -435,6 +435,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = xd->mi[0]->sb_type; vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY; + MV center_mv_full = ref_mv_full; + unsigned int start_mv_sad; + vp9_sad_fn_ptr_t sad_fn_ptr; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; @@ -455,10 +458,18 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } #endif // CONFIG_VP9_HIGHBITDEPTH + // Calculate SAD of the start mv + clamp_mv(&ref_mv_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + start_mv_sad = get_start_mv_sad(x, &ref_mv_full, ¢er_mv_full, + cpi->fn_ptr[bsize].sdf, x->sadperbit16); + sad_fn_ptr.sdf = cpi->fn_ptr[bsize].sdf; + sad_fn_ptr.sdx4df = cpi->fn_ptr[bsize].sdx4df; + // Center the initial step/diamond search on best mv. - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, + &tmp_mv, step_param, x->sadperbit16, &num00, + &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) tmp_err += new_mv_mode_penalty; @@ -478,9 +489,9 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, if (num00) { --num00; } else { - tmp_err = cpi->diamond_search_sad(x, &cpi->ss_cfg, &ref_mv_full, &tmp_mv, - step_param + n, x->sadperbit16, &num00, - &v_fn_ptr, ref_mv); + tmp_err = cpi->diamond_search_sad( + x, &cpi->ss_cfg, &ref_mv_full, start_mv_sad, &tmp_mv, step_param + n, + x->sadperbit16, &num00, &sad_fn_ptr, ref_mv); if (tmp_err < INT_MAX) tmp_err = vp9_get_mvpred_var(x, &tmp_mv, ref_mv, &v_fn_ptr, 1); if (tmp_err < INT_MAX - new_mv_mode_penalty) @@ -595,11 +606,11 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { #define FP_MAX_DN_THRESH 24 #define KERNEL_SIZE 3 -// Baseline Kernal weights for first pass noise metric -static uint8_t fp_dn_kernal_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, +// Baseline Kernel weights for first pass noise metric +static uint8_t fp_dn_kernel_3[KERNEL_SIZE * KERNEL_SIZE] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; -// Estimate noise at a single point based on the impace of a spatial kernal +// Estimate noise at a single point based on the impact of a spatial kernel // on the point value static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int sum_weight = 0; @@ -609,23 +620,23 @@ static int fp_estimate_point_noise(uint8_t *src_ptr, const int stride) { int diff; int 
dn_diff; uint8_t *tmp_ptr; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint8_t dn_val; uint8_t centre_val = *src_ptr; - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { for (j = 0; j < KERNEL_SIZE; ++j) { diff = abs((int)centre_val - (int)tmp_ptr[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -651,13 +662,13 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { int dn_diff; uint8_t *tmp_ptr; uint16_t *tmp_ptr16; - uint8_t *kernal_ptr; + uint8_t *kernel_ptr; uint16_t dn_val; uint16_t centre_val = *CONVERT_TO_SHORTPTR(src_ptr); - kernal_ptr = fp_dn_kernal_3; + kernel_ptr = fp_dn_kernel_3; - // Apply the kernal + // Apply the kernel tmp_ptr = src_ptr - stride - 1; for (i = 0; i < KERNEL_SIZE; ++i) { tmp_ptr16 = CONVERT_TO_SHORTPTR(tmp_ptr); @@ -665,10 +676,10 @@ static int fp_highbd_estimate_point_noise(uint8_t *src_ptr, const int stride) { diff = abs((int)centre_val - (int)tmp_ptr16[j]); max_diff = VPXMAX(max_diff, diff); if (diff <= FP_DN_THRESH) { - sum_weight += *kernal_ptr; - sum_val += (int)tmp_ptr16[j] * (int)*kernal_ptr; + sum_weight += *kernel_ptr; + sum_val += (int)tmp_ptr16[j] * (int)*kernel_ptr; } - ++kernal_ptr; + ++kernel_ptr; } tmp_ptr += stride; } @@ -793,6 +804,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, fps->inactive_zone_cols = (double)0; if (fp_acc_data->mvcount > 0) { + fps->new_mv_count = (double)(fp_acc_data->new_mv_count) / num_mbs; fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; @@ -809,6 +821,7 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; } else { + fps->new_mv_count = 0.0; fps->MVr = 0.0; fps->mvr_abs = 0.0; fps->MVc = 0.0; @@ -834,6 +847,7 @@ static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, this_tile->fp_data.intra_count_low += fp_acc_data->intra_count_low; this_tile->fp_data.intra_count_high += fp_acc_data->intra_count_high; this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.new_mv_count += fp_acc_data->new_mv_count; this_tile->fp_data.mvcount += fp_acc_data->mvcount; this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; @@ -904,6 +918,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, double mb_neutral_count; int scaled_low_intra_thresh = scale_sse_threshold(cm, LOW_I_THRESH); + MV *first_top_mv = &tile_data->firstpass_top_mv; + MV last_nonzero_mv = { 0, 0 }; + // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); assert(frame_is_intra_only(cm) || (lst_yv12 != NULL)); @@ -944,6 +961,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c); + if (mb_col == mb_col_start) { + last_nonzero_mv = *first_top_mv; + } + // Adjust to the next column of MBs. 
x->plane[0].src.buf = cpi->Source->y_buffer + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; @@ -1253,7 +1274,7 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, xd->mi[0]->mv[0].as_mv = mv; xd->mi[0]->tx_size = TX_4X4; xd->mi[0]->ref_frame[0] = LAST_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); vp9_encode_sby_pass1(x, bsize); fp_acc_data->sum_mvr += mv.row; @@ -1268,6 +1289,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, if (!is_zero_mv(&mv)) { ++(fp_acc_data->mvcount); + if (!is_equal_mv(&mv, &last_nonzero_mv)) { + ++(fp_acc_data->new_mv_count); + } + last_nonzero_mv = mv; // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { @@ -1323,6 +1348,9 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, } fp_acc_data->coded_error += (int64_t)this_error; + if (mb_col == mb_col_start) { + *first_top_mv = last_nonzero_mv; + } recon_yoffset += 16; recon_uvoffset += uv_mb_height; @@ -1345,7 +1373,7 @@ static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) { MV best_ref_mv; // Tiling is ignored in the first pass. vp9_tile_init(tile, cm, 0, 0); - + tile_data.firstpass_top_mv = zero_mv; #if CONFIG_RATE_CTRL if (cpi->oxcf.use_simple_encode_api) { fp_motion_vector_info_reset(cpi->frame_info.frame_width, @@ -1411,7 +1439,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { if (cpi->row_mt_bit_exact && cpi->twopass.fp_mb_float_stats == NULL) CHECK_MEM_ERROR( - cm, cpi->twopass.fp_mb_float_stats, + &cm->error, cpi->twopass.fp_mb_float_stats, vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); { @@ -1437,7 +1465,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } - // Dont allow a value of 0 for duration. + // Don't allow a value of 0 for duration. // (Section duration is also defaulted to minimum of 1.0). fps.duration = VPXMAX(1.0, (double)(source->ts_end - source->ts_start)); @@ -1447,7 +1475,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { accumulate_stats(&twopass->total_stats, &fps); } - // Copy the previous Last Frame back into gf and and arf buffers if + // Copy the previous Last Frame back into gf and arf buffers if // the prediction is good enough... but also don't allow it to lag too far. if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && @@ -1476,22 +1504,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { cm->ref_frame_map[cpi->lst_fb_idx]); } - // Use this to see what the first pass reconstruction looks like. - if (0) { - char filename[512]; - FILE *recon_file; - snprintf(filename, sizeof(filename), "enc%04d.yuv", - (int)cm->current_video_frame); - - if (cm->current_video_frame == 0) - recon_file = fopen(filename, "wb"); - else - recon_file = fopen(filename, "ab"); - - (void)fwrite(lst_yv12->buffer_alloc, lst_yv12->frame_size, 1, recon_file); - fclose(recon_file); - } - // In the first pass, every frame is considered as a show frame. update_frame_indexes(cm, /*show_frame=*/1); if (cpi->use_svc) vp9_inc_frame_in_layer(cpi); @@ -1664,7 +1676,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) { // Scan the first pass file and calculate a modified score for each // frame that is used to distribute bits. 
The modified score is assumed - // to provide a linear basis for bit allocation. I.e a frame A with a score + // to provide a linear basis for bit allocation. I.e., a frame A with a score // that is double that of frame B will be allocated 2x as many bits. { double modified_score_total = 0.0; @@ -1689,8 +1701,8 @@ } // Second scan using clamps based on the previous cycle average. - // This may modify the total and average somewhat but we dont bother with - // further itterations. + // This may modify the total and average somewhat but we don't bother with + // further iterations. modified_score_total = 0.0; s = twopass->stats_in; while (s < twopass->stats_in_end) { @@ -1847,7 +1859,7 @@ static int detect_flash_from_frame_stats(const FIRSTPASS_STATS *frame_stats) { // brief break in prediction (such as a flash) but subsequent frames // are reasonably well predicted by an earlier (pre flash) frame. // The recovery after a flash is indicated by a high pcnt_second_ref - // useage or a second ref coded error notabley lower than the last + // usage or a second ref coded error notably lower than the last // frame coded error. if (frame_stats == NULL) { return 0; } @@ -2027,7 +2039,7 @@ static int compute_arf_boost(const FRAME_INFO *frame_info, this_frame, &this_frame_mv_in_out, &mv_in_out_accumulator, &abs_mv_in_out_accumulator, &mv_ratio_accumulator); - // We want to discount the the flash frame itself and the recovery + // We want to discount the flash frame itself and the recovery // frame that follows as both will have poor scores. flash_detected = detect_flash_from_frame_stats(this_frame) || detect_flash_from_frame_stats(next_frame); @@ -2158,7 +2170,7 @@ static double calculate_group_score(VP9_COMP *cpi, double av_score, double score_total = 0.0; int i = 0; - // We dont ever want to return a 0 score here. + // We don't ever want to return a 0 score here. if (frame_count == 0) return 1.0; while ((i < frame_count) && (s < twopass->stats_in_end)) { @@ -2492,7 +2504,7 @@ static int get_gop_coding_frame_num( int *use_alt_ref, const FRAME_INFO *frame_info, const TWO_PASS *const twopass, const RATE_CONTROL *rc, int gf_start_show_idx, const RANGE *active_gf_interval, - double gop_intra_factor, int lag_in_frames) { + double gop_intra_factor, int lag_in_frames, int *end_of_sequence) { const FIRST_PASS_INFO *first_pass_info = &twopass->first_pass_info; double loop_decay_rate = 1.00; double mv_ratio_accumulator = 0.0; @@ -2518,6 +2530,7 @@ static int get_gop_coding_frame_num( next_frame = fps_get_frame_stats(first_pass_info, gf_start_show_idx + gop_coding_frames); if (next_frame == NULL) { + *end_of_sequence = gop_coding_frames == 1 && rc->source_alt_ref_active; break; } @@ -2586,7 +2599,7 @@ static int get_gop_coding_frame_num( if ( // Don't break out with a very short interval. (gop_coding_frames >= active_gf_interval->min) && - // If possible dont break very close to a kf + // If possible don't break very close to a kf ((rc->frames_to_key - gop_coding_frames) >= rc->min_gf_interval) && (gop_coding_frames & 0x01) && (!flash_detected) && ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) || @@ -2708,6 +2721,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { double gop_intra_factor; int gop_frames; RANGE active_gf_interval; + // Whether this is at the end of the last GOP of this sequence. + int end_of_sequence = 0; // Reset the GF group data structures unless this is a key // frame in which case it will already have been done.
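The linear-basis property described in the comment above (a frame whose modified score is double another's is allocated twice the bits) is equivalent to splitting the budget in proportion to each frame's share of the total score. The snippet below is an illustrative calculation only, not encoder code; the function name and the fixed budget are invented for the example.

#include <stdio.h>

/* Distribute total_bits across n frames in proportion to their modified
 * scores: bits[i] = total_bits * scores[i] / sum(scores). */
static void allocate_bits(const double *scores, int n, double total_bits,
                          double *bits) {
  double total = 0.0;
  int i;
  for (i = 0; i < n; ++i) total += scores[i];
  for (i = 0; i < n; ++i)
    bits[i] = (total > 0.0) ? total_bits * scores[i] / total : 0.0;
}

int main(void) {
  const double scores[3] = { 1.0, 2.0, 1.0 }; /* frame 1 scores 2x the others */
  double bits[3];
  allocate_bits(scores, 3, 40000.0, bits);
  printf("%.0f %.0f %.0f\n", bits[0], bits[1], bits[2]); /* 10000 20000 10000 */
  return 0;
}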
@@ -2739,7 +2754,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { gop_coding_frames = get_gop_coding_frame_num( &use_alt_ref, frame_info, twopass, rc, gf_start_show_idx, - &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames); + &active_gf_interval, gop_intra_factor, cpi->oxcf.lag_in_frames, + &end_of_sequence); use_alt_ref &= allow_alt_ref; #if CONFIG_RATE_CTRL // If the external gop_command is on, we will override the decisions @@ -2757,7 +2773,8 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) { // are overwritten. Specifically, |gop_coding_frames| and |use_alt_ref| // will be overwritten. if (cpi->ext_ratectrl.ready && - (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) { + (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0 && + cpi->ext_ratectrl.funcs.get_gop_decision != NULL && !end_of_sequence) { vpx_codec_err_t codec_status; vpx_rc_gop_decision_t gop_decision; vpx_rc_gop_info_t gop_info; @@ -3020,7 +3037,7 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error); // Return true the intra/inter ratio for the current frame is - // low but better in the next and previous frame and the relative useage of + // low but better in the next and previous frame and the relative usage of // intra in the current frame is markedly higher than the last and next frame. if ((this_ii_ratio < 2.0) && (last_ii_ratio > 2.25) && (next_ii_ratio > 2.25) && (this_pcnt_intra > (3 * last_pcnt_intra)) && @@ -3041,8 +3058,8 @@ static int intra_step_transition(const FIRSTPASS_STATS *this_frame, // Minimum % intra coding observed in first pass (1.0 = 100%) #define MIN_INTRA_LEVEL 0.25 // Threshold for use of the lagging second reference frame. Scene cuts do not -// usually have a high second ref useage. -#define SECOND_REF_USEAGE_THRESH 0.2 +// usually have a high second ref usage. +#define SECOND_REF_USAGE_THRESH 0.2 // Hard threshold where the first pass chooses intra for almost all blocks. // In such a case even if the frame is not a scene cut coding a key frame // may be a good option. @@ -3072,7 +3089,7 @@ static int test_candidate_kf(const FIRST_PASS_INFO *first_pass_info, detect_flash_from_frame_stats(next_frame); if (!detect_flash_from_frame_stats(this_frame) && !detect_flash_from_frame_stats(next_frame) && - (this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) && + (this_frame->pcnt_second_ref < SECOND_REF_USAGE_THRESH) && ((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) || (slide_transition(this_frame, last_frame, next_frame)) || (intra_step_transition(this_frame, last_frame, next_frame)) || @@ -3350,7 +3367,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { // The second (lagging) ref error is not valid immediately after // a key frame because either the lag has not built up (in the case of - // the first key frame or it points to a refernce before the new key + // the first key frame or it points to a reference before the new key // frame. if (i < 2) sr_accumulator = 0.0; frame_boost = @@ -3380,7 +3397,7 @@ static void find_next_key_frame(VP9_COMP *cpi, int kf_show_idx) { twopass->key_frame_section_intra_rating = calculate_section_intra_ratio( start_position, twopass->stats_in_end, rc->frames_to_key); - // Special case for static / slide show content but dont apply + // Special case for static / slide show content but don't apply // if the kf group is very short. 
if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) { rc->kf_boost = (int)(twopass->kf_max_total_boost); @@ -3494,8 +3511,8 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; const int show_idx = cm->current_video_frame; - if (cpi->common.current_frame_coding_index == 0) { - VP9_COMMON *cm = &cpi->common; + if (cpi->common.current_frame_coding_index == 0 && + cpi->ext_ratectrl.funcs.send_firstpass_stats != NULL) { const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats( &cpi->ext_ratectrl, &cpi->twopass.first_pass_info); if (codec_status != VPX_CODEC_OK) { @@ -3513,7 +3530,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { vp9_init_vizier_params(twopass, screen_area); } - // If this is an arf frame then we dont want to read the stats file or + // If this is an arf frame then we don't want to read the stats file or // advance the input pointer as we already have what we need. if (gf_group->update_type[gf_group->index] == ARF_UPDATE) { int target_rate; @@ -3792,6 +3809,7 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, const int arf_active_or_kf = last_gop_use_alt_ref || first_is_key_frame; RANGE active_gf_interval; int arf_layers; + int end_of_sequence = 0; if (oxcf->use_simple_encode_api) { active_gf_interval = get_active_gf_inverval_range_simple( rc->min_gf_interval, arf_active_or_kf, rc->frames_to_key); @@ -3809,9 +3827,9 @@ int vp9_get_gop_coding_frame_count(const VP9EncoderConfig *oxcf, gop_intra_factor = 1.0; } - frame_count = get_gop_coding_frame_num(use_alt_ref, frame_info, twopass, rc, - show_idx, &active_gf_interval, - gop_intra_factor, oxcf->lag_in_frames); + frame_count = get_gop_coding_frame_num( + use_alt_ref, frame_info, twopass, rc, show_idx, &active_gf_interval, + gop_intra_factor, oxcf->lag_in_frames, &end_of_sequence); *use_alt_ref &= allow_alt_ref; return frame_count; } diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index cdcf56872..a19b04db7 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -14,6 +14,7 @@ #include <assert.h> #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/encoder/vp9_firstpass_stats.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -55,37 +56,9 @@ typedef struct { int64_t sum_mvcs; int sum_in_vectors; int intra_smooth_count; + int new_mv_count; } FIRSTPASS_DATA; -typedef struct { - double frame; - double weight; - double intra_error; - double coded_error; - double sr_coded_error; - double frame_noise_energy; - double pcnt_inter; - double pcnt_motion; - double pcnt_second_ref; - double pcnt_neutral; - double pcnt_intra_low; // Coded intra but low variance - double pcnt_intra_high; // Coded intra high variance - double intra_skip_pct; - double intra_smooth_pct; // % of blocks that are smooth - double inactive_zone_rows; // Image mask rows top and bottom. - double inactive_zone_cols; // Image mask columns at left and right edges. - double MVr; - double mvr_abs; - double MVc; - double mvc_abs; - double MVrv; - double MVcv; - double mv_in_out_count; - double duration; - double count; - int64_t spatial_layer_id; -} FIRSTPASS_STATS; - typedef enum { KF_UPDATE = 0, LF_UPDATE = 1, diff --git a/vp9/encoder/vp9_firstpass_stats.h b/vp9/encoder/vp9_firstpass_stats.h new file mode 100644 index 000000000..01928e781 --- /dev/null +++ b/vp9/encoder/vp9_firstpass_stats.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ +#define VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ + +#include <stdint.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + double frame; + double weight; + double intra_error; + double coded_error; + double sr_coded_error; + double frame_noise_energy; + double pcnt_inter; + double pcnt_motion; + double pcnt_second_ref; + double pcnt_neutral; + double pcnt_intra_low; // Coded intra but low variance + double pcnt_intra_high; // Coded intra high variance + double intra_skip_pct; + double intra_smooth_pct; // % of blocks that are smooth + double inactive_zone_rows; // Image mask rows top and bottom. + double inactive_zone_cols; // Image mask columns at left and right edges. + double MVr; + double mvr_abs; + double MVc; + double mvc_abs; + double MVrv; + double MVcv; + double mv_in_out_count; + double duration; + double count; + double new_mv_count; + int64_t spatial_layer_id; +} FIRSTPASS_STATS; + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_FIRSTPASS_STATS_H_ diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index a410d0407..ba550a1d6 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -12,6 +12,7 @@ #include "./vpx_dsp_rtcd.h" #include "./vpx_scale_rtcd.h" #include "vp9/common/vp9_blockd.h" +#include "vp9/encoder/vp9_encoder.h" #include "vpx_dsp/vpx_filter.h" #include "vpx_scale/yv12config.h" @@ -91,6 +92,23 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, { const int dst_w = dst->y_crop_width; const int dst_h = dst->y_crop_height; + + // The issue b/311394513 reveals a corner case bug. vpx_scaled_2d() requires + // both x_step_q4 and y_step_q4 are less than or equal to 64. Otherwise, it + // needs to call vp9_scale_and_extend_frame_nonnormative() that supports + // arbitrary scaling. + const int x_step_q4 = 16 * src_w / dst_w; + const int y_step_q4 = 16 * src_h / dst_h; + if (x_step_q4 > 64 || y_step_q4 > 64) { + // This function is only called while cm->bit_depth is VPX_BITS_8. +#if CONFIG_VP9_HIGHBITDEPTH + vp9_scale_and_extend_frame_nonnormative(src, dst, (int)VPX_BITS_8); +#else + vp9_scale_and_extend_frame_nonnormative(src, dst); +#endif // CONFIG_VP9_HIGHBITDEPTH + return; + } + for (i = 0; i < MAX_MB_PLANE; ++i) { const int factor = (i == 0 || i == 3 ? 1 : 2); const int src_stride = src_strides[i]; diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 7c2790cb9..2f20a8fe6 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -98,8 +98,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv, // If the current best reference mv is not centered on 0,0 then do a 0,0 // based search as well. 
if (ref_mv->row != 0 || ref_mv->col != 0) { - unsigned int tmp_err; - MV zero_ref_mv = { 0, 0 }, tmp_mv; + MV zero_ref_mv = { 0, 0 }; tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv, mb_row, mb_col); @@ -238,7 +237,7 @@ static void update_mbgraph_frame_stats(VP9_COMP *cpi, xd->mi[0] = &mi_local; mi_local.sb_type = BLOCK_16X16; mi_local.ref_frame[0] = LAST_FRAME; - mi_local.ref_frame[1] = NONE; + mi_local.ref_frame[1] = NO_REF_FRAME; for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) { MV gld_left_mv = gld_top_mv; @@ -289,7 +288,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { int *arf_not_zz; CHECK_MEM_ERROR( - cm, arf_not_zz, + &cm->error, arf_not_zz, vpx_calloc(cm->mb_rows * cm->mb_cols * sizeof(*arf_not_zz), 1)); // We are not interested in results beyond the alt ref itself. @@ -334,23 +333,16 @@ static void separate_arf_mbs(VP9_COMP *cpi) { } } - // Only bother with segmentation if over 10% of the MBs in static segment - // if ( ncnt[1] && (ncnt[0] / ncnt[1] < 10) ) - if (1) { - // Note % of blocks that are marked as static - if (cm->MBs) - cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); + // Note % of blocks that are marked as static + if (cm->MBs) + cpi->static_mb_pct = (ncnt[1] * 100) / (cm->mi_rows * cm->mi_cols); - // This error case should not be reachable as this function should - // never be called with the common data structure uninitialized. - else - cpi->static_mb_pct = 0; - - vp9_enable_segmentation(&cm->seg); - } else { + // This error case should not be reachable as this function should + // never be called with the common data structure uninitialized. + else cpi->static_mb_pct = 0; - vp9_disable_segmentation(&cm->seg); - } + + vp9_enable_segmentation(&cm->seg); // Free locally allocated storage vpx_free(arf_not_zz); diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 1f08aa5de..cbe1c4029 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -77,14 +77,6 @@ int vp9_init_search_range(int size) { return sr; } -static INLINE int mv_cost(const MV *mv, const int *joint_cost, - int *const comp_cost[2]) { - assert(mv->row >= -MV_MAX && mv->row < MV_MAX); - assert(mv->col >= -MV_MAX && mv->col < MV_MAX); - return joint_cost[vp9_get_mv_joint(mv)] + comp_cost[0][mv->row] + - comp_cost[1][mv->col]; -} - int vp9_mv_bit_cost(const MV *mv, const MV *ref, const int *mvjcost, int *mvcost[2], int weight) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; @@ -103,15 +95,6 @@ static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost, } return 0; } - -static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref, - int sad_per_bit) { - const MV diff = { mv->row - ref->row, mv->col - ref->col }; - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - void vp9_init_dsmotion_compensation(search_site_config *cfg, int stride) { int len; int ss_count = 0; @@ -163,8 +146,8 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ if (second_pred == NULL) { \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ } else { \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse, second_pred); \ }
\ tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -192,15 +176,16 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { #define CHECK_BETTER(v, r, c) \ do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ if (second_pred == NULL) \ thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse); \ else \ thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \ src_stride, &sse, second_pred); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -312,7 +297,7 @@ static unsigned int setup_center_error( besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride, sse1); } else { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } @@ -327,7 +312,7 @@ static unsigned int setup_center_error( uint32_t besterr; (void)xd; if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride); besterr = vfp->vf(comp_pred, w, src, src_stride, sse1); } else { @@ -650,7 +635,7 @@ static int accurate_sub_pel_search( vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, 0, kernel, MV_PRECISION_Q3, 0, 0); if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); } else { @@ -669,7 +654,7 @@ static int accurate_sub_pel_search( vp9_build_inter_predictor(pre_address, y_stride, pred, w, this_mv, sf, w, h, 0, kernel, MV_PRECISION_Q3, 0, 0); if (second_pred != NULL) { - DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, comp_pred[64 * 64]); vpx_comp_avg_pred(comp_pred, second_pred, w, h, pred, w); besterr = vfp->vf(comp_pred, w, src_address, src_stride, sse); } else { @@ -686,13 +671,14 @@ static int accurate_sub_pel_search( do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ int64_t tmpmse; \ - const MV mv = { r, c }; \ - const MV ref_mv = { rr, rc }; \ - thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ src_stride, y, y_stride, second_pred, \ w, h, &sse); \ tmpmse = thismse; \ - tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit); \ + tmpmse += \ + mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, error_per_bit); \ if (tmpmse >= INT_MAX) { \ v = INT_MAX; \ } else if ((v = (uint32_t)tmpmse) < besterr) { \ @@ -711,12 +697,13 @@ static int accurate_sub_pel_search( #define CHECK_BETTER1(v, r, c) \ do { \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) { \ - const MV mv = { r, c }; 
\ - const MV ref_mv = { rr, rc }; \ - thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, \ + const MV cb_mv = { r, c }; \ + const MV cb_ref_mv = { rr, rc }; \ + thismse = accurate_sub_pel_search(xd, &cb_mv, x->me_sf, kernel, vfp, z, \ src_stride, y, y_stride, second_pred, \ w, h, &sse); \ - if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) + \ + if ((v = mv_err_cost(&cb_mv, &cb_ref_mv, mvjcost, mvcost, \ + error_per_bit) + \ thismse) < besterr) { \ besterr = v; \ br = r; \ @@ -966,7 +953,7 @@ static INLINE int is_mv_in(const MvLimits *mv_limits, const MV *mv) { } #define MAX_PATTERN_SCALES 11 -#define MAX_PATTERN_CANDIDATES 8 // max number of canddiates per scale +#define MAX_PATTERN_CANDIDATES 8 // max number of candidates per scale #define PATTERN_CANDIDATES_REF 3 // number of refinement candidates // Calculate and return a sad+mvcost list around an integer best pel. @@ -980,16 +967,14 @@ static INLINE void calc_int_cost_list(const MACROBLOCK *x, const MV *ref_mv, const MV fcenter_mv = { ref_mv->row >> 3, ref_mv->col >> 3 }; int br = best_mv->row; int bc = best_mv->col; - MV this_mv; + const MV mv = { br, bc }; int i; unsigned int sse; - this_mv.row = br; - this_mv.col = bc; cost_list[0] = - fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &this_mv), + fn_ptr->vf(what->buf, what->stride, get_buf_from_mv(in_what, &mv), in_what->stride, &sse) + - mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb); + mvsad_err_cost(x, &mv, &fcenter_mv, sadpb); if (check_bounds(&x->mv_limits, br, bc, 1)) { for (i = 0; i < 4; i++) { const MV this_mv = { br + neighbors[i].row, bc + neighbors[i].col }; @@ -1049,7 +1034,7 @@ static int vp9_pattern_search( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. if (do_init_search) { @@ -1170,6 +1155,9 @@ static int vp9_pattern_search( } while (s--); } + best_mv->row = br; + best_mv->col = bc; + // Returns the one-away integer pel sad values around the best as follows: // cost_list[0]: cost at the best integer pel // cost_list[1]: cost at delta {0, -1} (left) from the best integer pel @@ -1177,11 +1165,8 @@ static int vp9_pattern_search( // cost_list[3]: cost at delta { 0, 1} (right) from the best integer pel // cost_list[4]: cost at delta {-1, 0} (top) from the best integer pel if (cost_list) { - const MV best_mv = { br, bc }; - calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list); + calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, best_mv, cost_list); } - best_mv->row = br; - best_mv->col = bc; return bestsad; } @@ -1223,7 +1208,7 @@ static int vp9_pattern_search_sad( in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit); - // Search all possible scales upto the search param around the center point + // Search all possible scales up to the search param around the center point // pick the scale of the point that is best as the starting scale of // further steps around it. 
if (do_init_search) { @@ -2068,9 +2053,9 @@ int vp9_prepare_nb_full_mvs(const MotionField *motion_field, int mi_row, #endif // CONFIG_NON_GREEDY_MV int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, - MV *ref_mv, MV *best_mv, int search_param, - int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, + MV *ref_mv, uint32_t start_mv_sad, MV *best_mv, + int search_param, int sad_per_bit, int *num00, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { int i, j, step; @@ -2081,7 +2066,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int in_what_stride = xd->plane[0].pre[0].stride; const uint8_t *best_address; - unsigned int bestsad = INT_MAX; + unsigned int bestsad = start_mv_sad; int best_site = -1; int last_site = -1; @@ -2099,8 +2084,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, const int tot_steps = cfg->total_steps - search_param; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, - x->mv_limits.row_min, x->mv_limits.row_max); ref_row = ref_mv->row; ref_col = ref_mv->col; *num00 = 0; @@ -2111,10 +2094,6 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, in_what = xd->plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; best_address = in_what; - // Check the starting position - bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride) + - mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit); - i = 0; for (step = 0; step < tot_steps; step++) { @@ -2138,8 +2117,8 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, for (t = 0; t < 4; t++) block_offset[t] = ss_os[i + t] + best_address; - fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, - sad_array); + sad_fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, + sad_array); for (t = 0; t < 4; t++, i++) { if (sad_array[t] < bestsad) { @@ -2163,7 +2142,7 @@ int vp9_diamond_search_sad_c(const MACROBLOCK *x, const search_site_config *cfg, if (is_mv_in(&x->mv_limits, &this_mv)) { const uint8_t *const check_here = ss_os[i] + best_address; unsigned int thissad = - fn_ptr->sdf(what, what_stride, check_here, in_what_stride); + sad_fn_ptr->sdf(what, what_stride, check_here, in_what_stride); if (thissad < bestsad) { thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit); @@ -2321,17 +2300,16 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, // TODO(jingning): Implement integral projection functions for high bit-depth // setting and remove this part of code. 
if (xd->bd != 8) { - unsigned int this_sad; + const unsigned int sad = cpi->fn_ptr[bsize].sdf( + x->plane[0].src.buf, src_stride, xd->plane[0].pre[0].buf, ref_stride); tmp_mv->row = 0; tmp_mv->col = 0; - this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride, - xd->plane[0].pre[0].buf, ref_stride); if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; } - return this_sad; + return sad; } #endif @@ -2506,15 +2484,54 @@ int vp9_full_pixel_diamond_new(const VP9_COMP *cpi, MACROBLOCK *x, point as the best match, we will do a final 1-away diamond refining search */ static int full_pixel_diamond(const VP9_COMP *const cpi, - const MACROBLOCK *const x, MV *mvp_full, - int step_param, int sadpb, int further_steps, - int do_refine, int *cost_list, + const MACROBLOCK *const x, BLOCK_SIZE bsize, + MV *mvp_full, int step_param, int sadpb, + int further_steps, int do_refine, + int use_downsampled_sad, int *cost_list, const vp9_variance_fn_ptr_t *fn_ptr, const MV *ref_mv, MV *dst_mv) { MV temp_mv; int thissme, n, num00 = 0; - int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param, sadpb, &n, fn_ptr, ref_mv); + int bestsme; + const int src_buf_stride = x->plane[0].src.stride; + const uint8_t *const src_buf = x->plane[0].src.buf; + const MACROBLOCKD *const xd = &x->e_mbd; + const int pred_buf_stride = xd->plane[0].pre[0].stride; + uint8_t *pred_buf; + vp9_sad_fn_ptr_t sad_fn_ptr; + unsigned int start_mv_sad, start_mv_sad_even_rows, start_mv_sad_odd_rows; + const MV ref_mv_full = { ref_mv->row >> 3, ref_mv->col >> 3 }; + clamp_mv(mvp_full, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + + pred_buf = + xd->plane[0].pre[0].buf + mvp_full->row * pred_buf_stride + mvp_full->col; + start_mv_sad_even_rows = + fn_ptr->sdsf(src_buf, src_buf_stride, pred_buf, pred_buf_stride); + start_mv_sad_odd_rows = + fn_ptr->sdsf(src_buf + src_buf_stride, src_buf_stride, + pred_buf + pred_buf_stride, pred_buf_stride); + start_mv_sad = (start_mv_sad_even_rows + start_mv_sad_odd_rows) >> 1; + start_mv_sad += mvsad_err_cost(x, mvp_full, &ref_mv_full, sadpb); + + sad_fn_ptr.sdf = fn_ptr->sdf; + sad_fn_ptr.sdx4df = fn_ptr->sdx4df; + if (use_downsampled_sad && num_4x4_blocks_high_lookup[bsize] >= 2) { + // If the absolute difference between the pred-to-src SAD of even rows and + // the pred-to-src SAD of odd rows is small, skip every other row in sad + // computation. 
+ const int odd_to_even_diff_sad = + abs((int)start_mv_sad_even_rows - (int)start_mv_sad_odd_rows); + const int mult_thresh = 10; + if (odd_to_even_diff_sad * mult_thresh < (int)start_mv_sad_even_rows) { + sad_fn_ptr.sdf = fn_ptr->sdsf; + sad_fn_ptr.sdx4df = fn_ptr->sdsx4df; + } + } + + bestsme = + cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, &temp_mv, + step_param, sadpb, &n, &sad_fn_ptr, ref_mv); if (bestsme < INT_MAX) bestsme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); *dst_mv = temp_mv; @@ -2529,9 +2546,9 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (num00) { num00--; } else { - thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv, - step_param + n, sadpb, &num00, fn_ptr, - ref_mv); + thissme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, start_mv_sad, + &temp_mv, step_param + n, sadpb, &num00, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1); @@ -2549,8 +2566,8 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, if (do_refine) { const int search_range = 8; MV best_mv = *dst_mv; - thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, fn_ptr, - ref_mv); + thissme = vp9_refining_search_sad(x, &best_mv, sadpb, search_range, + &sad_fn_ptr, ref_mv); if (thissme < INT_MAX) thissme = vp9_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1); if (thissme < bestsme) { @@ -2559,6 +2576,27 @@ static int full_pixel_diamond(const VP9_COMP *const cpi, } } + if (sad_fn_ptr.sdf != fn_ptr->sdf) { + // If we are skipping rows when we perform the motion search, we need to + // check the quality of skipping. If it's bad, then we run search with + // skip row features off. + const uint8_t *best_address = get_buf_from_mv(&xd->plane[0].pre[0], dst_mv); + const int sad = + fn_ptr->sdf(src_buf, src_buf_stride, best_address, pred_buf_stride); + const int skip_sad = + fn_ptr->sdsf(src_buf, src_buf_stride, best_address, pred_buf_stride); + // We will keep the result of skipping rows if it's good enough. + const int kSADThresh = + 1 << (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); + if (sad > kSADThresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9) { + // There is a large discrepancy between skipping and not skipping, so we + // need to redo the motion search. + return full_pixel_diamond(cpi, x, bsize, mvp_full, step_param, sadpb, + further_steps, do_refine, 0, cost_list, fn_ptr, + ref_mv, dst_mv); + } + } + // Return cost list. 
if (cost_list) { calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list); @@ -2710,7 +2748,7 @@ int64_t vp9_refining_search_sad_new(const MACROBLOCK *x, MV *best_full_mv, int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, - const vp9_variance_fn_ptr_t *fn_ptr, + const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv) { const MACROBLOCKD *const xd = &x->e_mbd; const MV neighbors[4] = { { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 } }; @@ -2719,7 +2757,7 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; const uint8_t *best_address = get_buf_from_mv(in_what, ref_mv); unsigned int best_sad = - fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + + sad_fn_ptr->sdf(what->buf, what->stride, best_address, in_what->stride) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); int i, j; @@ -2736,7 +2774,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, best_address - 1, best_address + 1, best_address + in_what->stride }; - fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, sads); + sad_fn_ptr->sdx4df(what->buf, what->stride, positions, in_what->stride, + sads); for (j = 0; j < 4; ++j) { if (sads[j] < best_sad) { @@ -2756,8 +2795,8 @@ int vp9_refining_search_sad(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, if (is_mv_in(&x->mv_limits, &mv)) { unsigned int sad = - fn_ptr->sdf(what->buf, what->stride, - get_buf_from_mv(in_what, &mv), in_what->stride); + sad_fn_ptr->sdf(what->buf, what->stride, + get_buf_from_mv(in_what, &mv), in_what->stride); if (sad < best_sad) { sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit); if (sad < best_sad) { @@ -2874,9 +2913,10 @@ int vp9_full_pixel_search(const VP9_COMP *const cpi, const MACROBLOCK *const x, break; case NSTEP: case MESH: - var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, - MAX_MVSEARCH_STEPS - 1 - step_param, 1, - cost_list, fn_ptr, ref_mv, tmp_mv); + var = full_pixel_diamond( + cpi, x, bsize, mvp_full, step_param, error_per_bit, + MAX_MVSEARCH_STEPS - 1 - step_param, 1, + cpi->sf.mv.use_downsampled_sad, cost_list, fn_ptr, ref_mv, tmp_mv); break; default: assert(0 && "Unknown search method"); } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index bdaf2ce77..fd6a8b9ac 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -41,6 +41,11 @@ typedef struct search_site_config { int total_steps; } search_site_config; +typedef struct vp9_sad_table { + vpx_sad_fn_t sdf; + vpx_sad_multi_d_fn_t sdx4df; +} vp9_sad_fn_ptr_t; + static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf, const MV *mv) { return &buf->buf[mv->row * buf->stride + mv->col]; @@ -63,12 +68,13 @@ int vp9_get_mvpred_av_var(const MACROBLOCK *x, const MV *best_mv, struct VP9_COMP; struct SPEED_FEATURES; +struct vp9_sad_table; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, int error_per_bit, int search_range, - const struct vp9_variance_vtable *fn_ptr, + const struct vp9_sad_table *sad_fn_ptr, const struct mv *center_mv); // Perform integral projection based motion estimation. 
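The downsampled-SAD path added to full_pixel_diamond above rests on two tests: before the search, the even-row and odd-row SADs at the starting MV must agree closely enough to justify sampling every other row; after the search, the skipped-row SAD at the winning MV must not diverge too far from the full SAD, or the search is redone with skipping disabled. Below is a condensed sketch of the two predicates, reusing the constants from the diff; the helper names themselves are illustrative, not part of the encoder.

#include <stdlib.h>

#define VPXMAX(a, b) (((a) > (b)) ? (a) : (b))

/* Enable row skipping only when the even-row and odd-row SADs at the start
 * MV differ by less than one tenth of the even-row SAD. */
static int can_use_downsampled_sad(unsigned int sad_even_rows,
                                   unsigned int sad_odd_rows) {
  const int odd_to_even_diff_sad =
      abs((int)sad_even_rows - (int)sad_odd_rows);
  const int mult_thresh = 10;
  return odd_to_even_diff_sad * mult_thresh < (int)sad_even_rows;
}

/* After the search, redo it without skipping if the skipped SAD at the best
 * MV differs from the full SAD by 90% or more of the full value. */
static int must_redo_full_search(int sad, int skip_sad, int sad_thresh) {
  return sad > sad_thresh && abs(skip_sad - sad) * 10 >= VPXMAX(sad, 1) * 9;
}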
@@ -94,9 +100,9 @@ extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv; extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv; typedef int (*vp9_diamond_search_fn_t)( - const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, - int search_param, int sad_per_bit, int *num00, - const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv); + const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, + uint32_t start_mv_sad, MV *best_mv, int search_param, int sad_per_bit, + int *num00, const vp9_sad_fn_ptr_t *sad_fn_ptr, const MV *center_mv); int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c index 45659f2a9..0843cd97e 100644 --- a/vp9/encoder/vp9_multi_thread.c +++ b/vp9/encoder/vp9_multi_thread.c @@ -59,7 +59,7 @@ void vp9_row_mt_alloc_rd_thresh(VP9_COMP *const cpi, int i; CHECK_MEM_ERROR( - cm, this_tile->row_base_thresh_freq_fact, + &cm->error, this_tile->row_base_thresh_freq_fact, (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES, sizeof(*(this_tile->row_base_thresh_freq_fact)))); for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++) @@ -85,7 +85,7 @@ void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { multi_thread_ctxt->allocated_tile_rows = tile_rows; multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; - CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue, + CHECK_MEM_ERROR(&cm->error, multi_thread_ctxt->job_queue, (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue))); #if CONFIG_MULTITHREAD diff --git a/vp9/encoder/vp9_noise_estimate.c b/vp9/encoder/vp9_noise_estimate.c index 9696529c5..4ee6e51ba 100644 --- a/vp9/encoder/vp9_noise_estimate.c +++ b/vp9/encoder/vp9_noise_estimate.c @@ -202,7 +202,7 @@ void vp9_update_noise_estimate(VP9_COMP *const cpi) { VPXMIN(cpi->consec_zero_mv[bl_index1], VPXMIN(cpi->consec_zero_mv[bl_index2], cpi->consec_zero_mv[bl_index3]))); - // Only consider blocks that are likely steady background. i.e, have + // Only consider blocks that are likely steady background. i.e., have // been encoded as zero/low motion x (= thresh_consec_zeromv) frames // in a row. consec_zero_mv[] defined for 8x8 blocks, so consider all // 4 sub-blocks for 16x16 block. And exclude this frame if diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 579b466ca..6f2524b36 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -566,23 +566,26 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize, // Transform skipping test in UV planes. 
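The UV transform-skip test that follows compares per-plane variance and SSE against thresholds derived from the squared dequantizer steps. A condensed sketch of the predicate, with plain scalars standing in for the plane structs (the helper name is ours):

```c
#include <stdint.h>

/* Sketch of the test below: AC energy (variance) must fall under the squared
 * AC dequant step, and DC energy (sse - var) under the squared DC dequant
 * step, both scaled by the block/transform size ratio sf. */
static int uv_txfm_skippable(uint32_t var, uint32_t sse, uint16_t dequant_dc,
                             uint16_t dequant_ac, int sf) {
  const uint32_t dc_thr = ((uint32_t)dequant_dc * dequant_dc) >> (6 - sf);
  const uint32_t ac_thr = ((uint32_t)dequant_ac * dequant_ac) >> (6 - sf);
  return (var < ac_thr || var == 0) && (sse - var < dc_thr || sse == var);
}
```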
for (i = 1; i <= 2; i++) { - struct macroblock_plane *const p = &x->plane[i]; - struct macroblockd_plane *const pd = &xd->plane[i]; - const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd); + struct macroblock_plane *const p_uv = &x->plane[i]; + struct macroblockd_plane *const pd_uv = &xd->plane[i]; + const TX_SIZE uv_tx_size = get_uv_tx_size(xd->mi[0], pd_uv); const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size]; - const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd); + const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd_uv); const int uv_bw = b_width_log2_lookup[uv_bsize]; const int uv_bh = b_height_log2_lookup[uv_bsize]; const int sf = (uv_bw - b_width_log2_lookup[unit_size]) + (uv_bh - b_height_log2_lookup[unit_size]); - const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf); - const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf); + const uint32_t uv_dc_thr = + pd_uv->dequant[0] * pd_uv->dequant[0] >> (6 - sf); + const uint32_t uv_ac_thr = + pd_uv->dequant[1] * pd_uv->dequant[1] >> (6 - sf); int j = i - 1; vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); flag_preduv_computed[i - 1] = 1; - var_uv[j] = cpi->fn_ptr[uv_bsize].vf( - p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse_uv[j]); + var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p_uv->src.buf, p_uv->src.stride, + pd_uv->dst.buf, pd_uv->dst.stride, + &sse_uv[j]); if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) && (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j])) @@ -768,7 +771,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, for (r = 0; r < max_blocks_high; r += block_step) { for (c = 0; c < num_4x4_w; c += block_step) { if (c < max_blocks_wide) { - const scan_order *const scan_order = &vp9_default_scan_orders[tx_size]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block); tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block); tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); @@ -783,22 +786,19 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, switch (tx_size) { case TX_16X16: vpx_hadamard_16x16(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 256, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; case TX_8X8: vpx_hadamard_8x8(src_diff, diff_stride, coeff); - vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 64, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; default: assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); - vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, - dqcoeff, pd->dequant, eob, scan_order->scan, - scan_order->iscan); + vp9_quantize_fp(coeff, 16, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); break; } *skippable &= (*eob == 0); @@ -1395,8 +1395,8 @@ static void recheck_zeromv_after_denoising( RD_COST this_rdc; mi->mode = ZEROMV; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; - set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); + mi->ref_frame[1] = NO_REF_FRAME; + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->mv[0].as_int = 0; mi->interp_filter = EIGHTTAP; if (cpi->sf.default_interp_filter == BILINEAR) mi->interp_filter = BILINEAR; @@ -1414,7 +1414,7 @@ static void 
recheck_zeromv_after_denoising( this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; - set_ref_ptrs(cm, xd, mi->ref_frame[0], NONE); + set_ref_ptrs(cm, xd, mi->ref_frame[0], NO_REF_FRAME); mi->interp_filter = ctx_den->best_pred_filter; if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; @@ -1678,7 +1678,7 @@ static INLINE void init_best_pickmode(BEST_PICKMODE *bp) { bp->best_intra_tx_size = TX_SIZES; bp->best_pred_filter = EIGHTTAP; bp->best_mode_skip_txfm = SKIP_TXFM_NONE; - bp->best_second_ref_frame = NONE; + bp->best_second_ref_frame = NO_REF_FRAME; bp->best_pred = NULL; } @@ -1872,8 +1872,8 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vp9_rd_cost_reset(&best_rdc); vp9_rd_cost_reset(rd_cost); mi->sb_type = bsize; - mi->ref_frame[0] = NONE; - mi->ref_frame[1] = NONE; + mi->ref_frame[0] = NO_REF_FRAME; + mi->ref_frame[1] = NO_REF_FRAME; mi->tx_size = VPXMIN(max_txsize_lookup[bsize], tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -1933,15 +1933,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->use_svc && svc->force_zero_mode_spatial_ref && svc->spatial_layer_id > 0 && !gf_temporal_ref) { if (cpi->ref_frame_flags & VP9_LAST_FLAG) { - struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf; - if (vp9_is_scaled(sf)) { + struct scale_factors *const ref_sf = &cm->frame_refs[LAST_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { svc_force_zero_mode[LAST_FRAME - 1] = 1; inter_layer_ref = LAST_FRAME; } } if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { - struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; - if (vp9_is_scaled(sf)) { + struct scale_factors *const ref_sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf; + if (vp9_is_scaled(ref_sf)) { svc_force_zero_mode[GOLDEN_FRAME - 1] = 1; inter_layer_ref = GOLDEN_FRAME; } @@ -2051,7 +2051,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int comp_pred = 0; int force_mv_inter_layer = 0; PREDICTION_MODE this_mode; - second_ref_frame = NONE; + second_ref_frame = NO_REF_FRAME; if (idx < num_inter_modes) { this_mode = ref_mode_set[idx].pred_mode; @@ -2628,7 +2628,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, best_pickmode.best_mode = this_mode; best_pickmode.best_intra_tx_size = mi->tx_size; best_pickmode.best_ref_frame = INTRA_FRAME; - best_pickmode.best_second_ref_frame = NONE; + best_pickmode.best_second_ref_frame = NO_REF_FRAME; mi->uv_mode = this_mode; mi->mv[0].as_int = INVALID_MV; mi->mv[1].as_int = INVALID_MV; @@ -2750,8 +2750,8 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, MODE_INFO *const mi = xd->mi[0]; MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; const struct segmentation *const seg = &cm->seg; - MV_REFERENCE_FRAME ref_frame, second_ref_frame = NONE; - MV_REFERENCE_FRAME best_ref_frame = NONE; + MV_REFERENCE_FRAME ref_frame, second_ref_frame = NO_REF_FRAME; + MV_REFERENCE_FRAME best_ref_frame = NO_REF_FRAME; unsigned char segment_id = mi->segment_id; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; int64_t best_rd = INT64_MAX; @@ -2772,9 +2772,10 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) { int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame]; - const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; - vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, 
mi_row, mi_col, sf, - sf); + const struct scale_factors *const ref_sf = + &cm->frame_refs[ref_frame - 1].sf; + vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, ref_sf, + ref_sf); vp9_find_mv_refs(cm, xd, xd->mi[0], ref_frame, candidates, mi_row, mi_col, mbmi_ext->mode_context); @@ -2789,7 +2790,7 @@ void vp9_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, mi->tx_size = TX_4X4; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP : cm->interp_filter; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index dcc44449f..19edf166d 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -15,6 +15,7 @@ #include "vpx_ports/mem.h" #include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_scan.h" #include "vp9/common/vp9_seg_common.h" #include "vp9/encoder/vp9_encoder.h" @@ -22,12 +23,14 @@ #include "vp9/encoder/vp9_rd.h" void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -53,15 +56,15 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i; int eob = -1; - - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -86,12 +89,14 @@ void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, // TODO(jingning) Refactor this file and combine functions with similar // operations. 
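These signature changes move `round_fp`/`quant_fp` and the scan table behind the `macroblock_plane` and `ScanOrder` arguments without touching the arithmetic. For orientation, a simplified standalone sketch of this style of fast-path quantizer; the names are ours, and the 16-bit clamping the real code performs is omitted:

```c
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Visit coefficients in scan order; DC (position 0) uses entry 0 of the
 * per-band tables, all AC positions use entry 1. eob is one past the last
 * scan index that quantized to a nonzero value. */
static void quantize_fp_sketch(const int32_t *coeff, int n,
                               const int16_t *round, const int16_t *quant,
                               const int16_t *dequant, const int16_t *scan,
                               int32_t *qcoeff, int32_t *dqcoeff,
                               uint16_t *eob) {
  int i, last = -1;
  memset(qcoeff, 0, n * sizeof(*qcoeff));
  memset(dqcoeff, 0, n * sizeof(*dqcoeff));
  for (i = 0; i < n; ++i) {
    const int rc = scan[i];
    const int band = rc != 0;
    const int32_t c = coeff[rc];
    const int32_t abs_q =
        (int32_t)((((int64_t)abs(c) + round[band]) * quant[band]) >> 16);
    qcoeff[rc] = c < 0 ? -abs_q : abs_q;
    dqcoeff[rc] = qcoeff[rc] * dequant[band];
    if (abs_q) last = i;
  }
  *eob = (uint16_t)(last + 1);
}
```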
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, eob = -1; - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -118,13 +123,14 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_quantize_fp_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { int i, eob = -1; - - (void)iscan; + const int16_t *round_ptr = mb_plane->round_fp; + const int16_t *quant_ptr = mb_plane->quant_fp; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -249,8 +255,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { // Y x->plane[0].quant = quants->y_quant[qindex]; x->plane[0].quant_fp = quants->y_quant_fp[qindex]; - memcpy(x->plane[0].round_fp, quants->y_round_fp[qindex], - 8 * sizeof(*(x->plane[0].round_fp))); + x->plane[0].round_fp = quants->y_round_fp[qindex]; x->plane[0].quant_shift = quants->y_quant_shift[qindex]; x->plane[0].zbin = quants->y_zbin[qindex]; x->plane[0].round = quants->y_round[qindex]; @@ -262,8 +267,7 @@ void vp9_init_plane_quantizers(VP9_COMP *cpi, MACROBLOCK *x) { for (i = 1; i < 3; i++) { x->plane[i].quant = quants->uv_quant[qindex]; x->plane[i].quant_fp = quants->uv_quant_fp[qindex]; - memcpy(x->plane[i].round_fp, quants->uv_round_fp[qindex], - 8 * sizeof(*(x->plane[i].round_fp))); + x->plane[i].round_fp = quants->uv_round_fp[qindex]; x->plane[i].quant_shift = quants->uv_quant_shift[qindex]; x->plane[i].zbin = quants->uv_zbin[qindex]; x->plane[i].round = quants->uv_round[qindex]; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index d9207f7a2..6452e349d 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -260,7 +260,7 @@ void vp9_update_buffer_level_preencode(VP9_COMP *cpi) { // for the layered rate control which involves cumulative buffer levels for // the temporal layers. Allow for using the timestamp(pts) delta for the // framerate when the set_ref_frame_config is used. 
-static void update_buffer_level_svc_preencode(VP9_COMP *cpi) { +void vp9_update_buffer_level_svc_preencode(VP9_COMP *cpi) { SVC *const svc = &cpi->svc; int i; // Set this to 1 to use timestamp delta for "framerate" under @@ -680,7 +680,8 @@ static int adjust_q_cbr(const VP9_COMP *cpi, int q) { else q = qclamp; } - if (cpi->oxcf.content == VP9E_CONTENT_SCREEN) + if (cpi->oxcf.content == VP9E_CONTENT_SCREEN && + cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_limit_q(cpi, &q); return VPXMAX(VPXMIN(q, cpi->rc.worst_quality), cpi->rc.best_quality); } @@ -1150,8 +1151,9 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (frame_is_intra_only(cm)) { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); - int delta_qindex = vp9_compute_qdelta(rc, q, q * 0.25, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + int delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.25, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else if (rc->this_key_frame_forced) { // Handle the special case for key frames forced when we have reached @@ -1195,7 +1197,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { q = rc->avg_frame_qindex[KEY_FRAME]; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; @@ -1206,12 +1208,14 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex; if (cpi->refresh_alt_ref_frame) - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.40, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.40, cm->bit_depth); else - delta_qindex = vp9_compute_qdelta(rc, q, q * 0.50, cm->bit_depth); + delta_qindex = + vp9_compute_qdelta(rc, qstart, qstart * 0.50, cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { active_best_quality = get_gf_active_quality(cpi, q, cm->bit_depth); @@ -1219,11 +1223,12 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, } else { if (oxcf->rc_mode == VPX_Q) { int qindex = cq_level; - double q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); + double qstart = vp9_convert_qindex_to_q(qindex, cm->bit_depth); double delta_rate[FIXED_GF_INTERVAL] = { 0.50, 1.0, 0.85, 1.0, 0.70, 1.0, 0.85, 1.0 }; int delta_qindex = vp9_compute_qdelta( - rc, q, q * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], + rc, qstart, + qstart * delta_rate[cm->current_video_frame % FIXED_GF_INTERVAL], cm->bit_depth); active_best_quality = VPXMAX(qindex + delta_qindex, rc->best_quality); } else { @@ -1355,7 +1360,7 @@ static void pick_kf_q_bound_two_pass(const VP9_COMP *cpi, int *bottom_index, active_best_quality /= 4; } - // Dont allow the active min to be lossless (q0) unlesss the max q + // Don't allow the active min to be lossless (q0) unless the max q // already indicates lossless.
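The `q` to `qstart` renames above avoid shadowing the enclosing `q` but keep the VPX_Q logic: scale the configured quantizer (0.25 for key frames, 0.40/0.50 for alt-ref and other frames) and convert the result back into a qindex delta. A rough sketch of that inverse mapping, assuming a monotonically increasing `q_table[]` as a stand-in for `vp9_convert_qindex_to_q()`; the real `vp9_compute_qdelta()` may differ in details:

```c
/* Find the qindex whose quantizer step best matches qstart * factor and
 * return its distance from the starting index (negative deltas raise
 * quality). */
static int compute_qdelta_sketch(const double *q_table, int num_qindex,
                                 int start_qindex, double factor) {
  const double target_q = q_table[start_qindex] * factor;
  int i, target_index = num_qindex - 1;
  for (i = 0; i < num_qindex; ++i) {
    if (q_table[i] >= target_q) {
      target_index = i;
      break;
    }
  }
  return target_index - start_qindex;
}
```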
active_best_quality = VPXMIN(active_worst_quality, VPXMAX(1, active_best_quality)); @@ -1453,7 +1458,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index, } else { q = active_worst_quality; } - // For constrained quality dont allow Q less than the cq level + // For constrained quality don't allow Q less than the cq level if (oxcf->rc_mode == VPX_CQ) { if (q < cq_level) q = cq_level; } @@ -1859,8 +1864,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[KEY_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[KEY_FRAME] + qindex, 2); if (cpi->use_svc) { - int i = 0; - SVC *svc = &cpi->svc; + int i; for (i = 0; i < svc->number_temporal_layers; ++i) { const int layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, i, svc->number_temporal_layers); @@ -1988,6 +1992,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth; if (cpi->use_svc && svc->spatial_layer_id < svc->number_spatial_layers - 1) svc->lower_layer_qindex = cm->base_qindex; + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { @@ -2008,6 +2013,7 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.buffer_level = cpi->rc.optimal_buffer_level; cpi->rc.bits_off_target = cpi->rc.optimal_buffer_level; } + cpi->deadline_mode_previous_frame = cpi->oxcf.mode; } int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { @@ -2033,7 +2039,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { int vp9_calc_iframe_target_size_one_pass_vbr(const VP9_COMP *cpi) { static const int kf_ratio = 25; const RATE_CONTROL *rc = &cpi->rc; - const int target = rc->avg_frame_bandwidth * kf_ratio; + int target = rc->avg_frame_bandwidth; + if (target > INT_MAX / kf_ratio) + target = INT_MAX; + else + target = rc->avg_frame_bandwidth * kf_ratio; return vp9_rc_clamp_iframe_target_size(cpi, target); } @@ -2111,7 +2121,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { int target; if (!cpi->refresh_alt_ref_frame && (cm->current_video_frame == 0 || (cpi->frame_flags & FRAMEFLAGS_KEY) || - rc->frames_to_key == 0)) { + rc->frames_to_key == 0 || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame))) { cm->frame_type = KEY_FRAME; rc->this_key_frame_forced = cm->current_video_frame != 0 && rc->frames_to_key == 0; @@ -2165,12 +2176,12 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) { if (diff > 0) { // Lower the target bandwidth for this frame. const int pct_low = (int)VPXMIN(diff / one_pct_bits, oxcf->under_shoot_pct); - target -= (target * pct_low) / 200; + target -= (int)(((int64_t)target * pct_low) / 200); } else if (diff < 0) { // Increase the target bandwidth for this frame. const int pct_high = (int)VPXMIN(-diff / one_pct_bits, oxcf->over_shoot_pct); - target += (target * pct_high) / 200; + target += (int)(((int64_t)target * pct_high) / 200); } if (oxcf->rc_max_inter_bitrate_pct) { const int max_rate = @@ -2277,14 +2288,15 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { // Periodic key frames is based on the super-frame counter // (svc.current_superframe), also only base spatial layer is key frame. // Key frame is set for any of the following: very first frame, frame flags - // indicates key, superframe counter hits key frequency, or (non-intra) sync - // flag is set for spatial layer 0. 
+ // indicates key, superframe counter hits key frequency, (non-intra) sync + // flag is set for spatial layer 0, or deadline mode changes. if ((cm->current_video_frame == 0 && !svc->previous_frame_is_intra_only) || (cpi->frame_flags & FRAMEFLAGS_KEY) || (cpi->oxcf.auto_key && (svc->current_superframe % cpi->oxcf.key_freq == 0) && !svc->previous_frame_is_intra_only && svc->spatial_layer_id == 0) || - (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) { + (svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->source_alt_ref_active = 0; if (is_one_pass_svc(cpi)) { @@ -2438,7 +2450,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) { vp9_cyclic_refresh_update_parameters(cpi); vp9_rc_set_frame_target(cpi, target); - if (cm->show_frame) update_buffer_level_svc_preencode(cpi); + if (cm->show_frame) vp9_update_buffer_level_svc_preencode(cpi); if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 && svc->spatial_layer_id == svc->first_spatial_layer_to_encode && @@ -2483,7 +2495,8 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int target; if ((cm->current_video_frame == 0) || (cpi->frame_flags & FRAMEFLAGS_KEY) || - (cpi->oxcf.auto_key && rc->frames_to_key == 0)) { + (cpi->oxcf.auto_key && rc->frames_to_key == 0) || + (cpi->oxcf.mode != cpi->deadline_mode_previous_frame)) { cm->frame_type = KEY_FRAME; rc->frames_to_key = cpi->oxcf.key_freq; rc->kf_boost = DEFAULT_KF_BOOST; @@ -2636,7 +2649,8 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) { RATE_CONTROL *const rc = &cpi->rc; int vbr_max_bits; - rc->avg_frame_bandwidth = (int)(oxcf->target_bandwidth / cpi->framerate); + rc->avg_frame_bandwidth = + (int)VPXMIN(oxcf->target_bandwidth / cpi->framerate, INT_MAX); rc->min_frame_bandwidth = (int)(rc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); @@ -2690,7 +2704,7 @@ static void vbr_rate_correction(VP9_COMP *cpi, int *this_frame_target) { } // Fast redistribution of bits arising from massive local undershoot. - // Dont do it for kf,arf,gf or overlay frames. + // Don't do it for kf, arf, gf or overlay frames.
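Several rate-control hunks above share one overflow fix: a frame target scaled by a ratio or percentage is now computed in 64 bits and saturated before narrowing back to `int`, so a very large `target_bandwidth` can no longer wrap. The pattern, condensed into a hypothetical helper:

```c
#include <limits.h>
#include <stdint.h>

/* Scale target by num/den in 64-bit and saturate to int range. E.g. the
 * keyframe target scales avg_frame_bandwidth by 25/1, and the CBR
 * under/overshoot adjustments scale the target by pct/200. */
static int scale_target_saturating(int target, int num, int den) {
  const int64_t scaled = (int64_t)target * num / den;
  if (scaled > INT_MAX) return INT_MAX;
  if (scaled < INT_MIN) return INT_MIN;
  return (int)scaled;
}
```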
if (!frame_is_kf_gf_arf(cpi) && !rc->is_src_frame_alt_ref && rc->vbr_bits_off_target_fast) { int one_frame_bits = VPXMAX(rc->avg_frame_bandwidth, *this_frame_target); @@ -3269,11 +3283,9 @@ int vp9_encodedframe_overshoot(VP9_COMP *cpi, int frame_size, int *q) { MODE_INFO **mi = cm->mi_grid_visible; int sum_intra_usage = 0; int mi_row, mi_col; - int tot = 0; for (mi_row = 0; mi_row < cm->mi_rows; mi_row++) { for (mi_col = 0; mi_col < cm->mi_cols; mi_col++) { if (mi[0]->ref_frame[0] == INTRA_FRAME) sum_intra_usage++; - tot++; mi++; } mi += 8; diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 96a8fd3f1..48c49e937 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -350,6 +350,8 @@ void vp9_estimate_qp_gop(struct VP9_COMP *cpi); void vp9_compute_frame_low_motion(struct VP9_COMP *const cpi); +void vp9_update_buffer_level_svc_preencode(struct VP9_COMP *cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 58dd75b44..95c95971c 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -513,22 +513,6 @@ static void model_rd_norm(int xsq_q10, int *r_q10, int *d_q10) { *d_q10 = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; } -static void model_rd_norm_vec(int xsq_q10[MAX_MB_PLANE], - int r_q10[MAX_MB_PLANE], - int d_q10[MAX_MB_PLANE]) { - int i; - const int one_q10 = 1 << 10; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const int tmp = (xsq_q10[i] >> 2) + 8; - const int k = get_msb(tmp) - 3; - const int xq = (k << 3) + ((tmp >> k) & 0x7); - const int a_q10 = ((xsq_q10[i] - xsq_iq_q10[xq]) << 10) >> (2 + k); - const int b_q10 = one_q10 - a_q10; - r_q10[i] = (rate_tab_q10[xq] * b_q10 + rate_tab_q10[xq + 1] * a_q10) >> 10; - d_q10[i] = (dist_tab_q10[xq] * b_q10 + dist_tab_q10[xq + 1] * a_q10) >> 10; - } -} - static const uint32_t MAX_XSQ_Q10 = 245727; void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, @@ -554,30 +538,6 @@ void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, } } -// Implements a fixed length vector form of vp9_model_rd_from_var_lapndz where -// vectors are of length MAX_MB_PLANE and all elements of var are non-zero. -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum) { - int i; - int xsq_q10[MAX_MB_PLANE], d_q10[MAX_MB_PLANE], r_q10[MAX_MB_PLANE]; - for (i = 0; i < MAX_MB_PLANE; ++i) { - const uint64_t xsq_q10_64 = - (((uint64_t)qstep[i] * qstep[i] << (n_log2[i] + 10)) + (var[i] >> 1)) / - var[i]; - xsq_q10[i] = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10); - } - model_rd_norm_vec(xsq_q10, r_q10, d_q10); - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate = - ROUND_POWER_OF_TWO(r_q10[i] << n_log2[i], 10 - VP9_PROB_COST_SHIFT); - int64_t dist = (var[i] * (int64_t)d_q10[i] + 512) >> 10; - *rate_sum += rate; - *dist_sum += dist; - } -} - // Disable gcc 12.2 false positive warning. 
// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] #if defined(__GNUC__) && !defined(__clang__) diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index d2bc5e60e..6c61ae514 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -121,11 +121,9 @@ typedef struct RD_OPT { int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#if CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int64_t prediction_type_threshes_prev[MAX_REF_FRAMES][REFERENCE_MODES]; int64_t filter_threshes_prev[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS]; -#endif // CONFIG_CONSISTENT_RECODE || CONFIG_RATE_CTRL int RDMULT; int RDDIV; double r0; @@ -166,11 +164,6 @@ void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); -void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], - unsigned int n_log2[MAX_MB_PLANE], - unsigned int qstep[MAX_MB_PLANE], - int64_t *rate_sum, int64_t *dist_sum); - int vp9_get_switchable_rate(const struct VP9_COMP *cpi, const MACROBLOCKD *const xd); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a464ce38f..974e43c90 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -77,7 +77,7 @@ struct rdcost_block_args { int64_t best_rd; int exit_early; int use_fast_coef_costing; - const scan_order *so; + const ScanOrder *so; uint8_t skippable; struct buf_2d *this_recon; }; @@ -86,28 +86,28 @@ struct rdcost_block_args { #if !CONFIG_REALTIME_ONLY static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { - { NEARESTMV, { LAST_FRAME, NONE } }, - { NEARESTMV, { ALTREF_FRAME, NONE } }, - { NEARESTMV, { GOLDEN_FRAME, NONE } }, + { NEARESTMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARESTMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { DC_PRED, { INTRA_FRAME, NONE } }, + { DC_PRED, { INTRA_FRAME, NO_REF_FRAME } }, - { NEWMV, { LAST_FRAME, NONE } }, - { NEWMV, { ALTREF_FRAME, NONE } }, - { NEWMV, { GOLDEN_FRAME, NONE } }, + { NEWMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEWMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEWMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { NEARMV, { LAST_FRAME, NONE } }, - { NEARMV, { ALTREF_FRAME, NONE } }, - { NEARMV, { GOLDEN_FRAME, NONE } }, + { NEARMV, { LAST_FRAME, NO_REF_FRAME } }, + { NEARMV, { ALTREF_FRAME, NO_REF_FRAME } }, + { NEARMV, { GOLDEN_FRAME, NO_REF_FRAME } }, - { ZEROMV, { LAST_FRAME, NONE } }, - { ZEROMV, { GOLDEN_FRAME, NONE } }, - { ZEROMV, { ALTREF_FRAME, NONE } }, + { ZEROMV, { LAST_FRAME, NO_REF_FRAME } }, + { ZEROMV, { GOLDEN_FRAME, NO_REF_FRAME } }, + { ZEROMV, { ALTREF_FRAME, NO_REF_FRAME } }, { NEARESTMV, { LAST_FRAME, ALTREF_FRAME } }, { NEARESTMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { TM_PRED, { INTRA_FRAME, NONE } }, + { TM_PRED, { INTRA_FRAME, NO_REF_FRAME } }, { NEARMV, { LAST_FRAME, ALTREF_FRAME } }, { NEWMV, { LAST_FRAME, ALTREF_FRAME } }, @@ -117,20 +117,20 @@ static const MODE_DEFINITION vp9_mode_order[MAX_MODES] = { { ZEROMV, { LAST_FRAME, ALTREF_FRAME } }, { ZEROMV, { GOLDEN_FRAME, ALTREF_FRAME } }, - { H_PRED, { INTRA_FRAME, NONE } }, - { V_PRED, { INTRA_FRAME, NONE } }, - { D135_PRED, { INTRA_FRAME, NONE } }, - { D207_PRED, { INTRA_FRAME, NONE } }, - { D153_PRED, { INTRA_FRAME, NONE } }, - { D63_PRED, { INTRA_FRAME, NONE } }, - { D117_PRED, { INTRA_FRAME, NONE } }, - { D45_PRED, { INTRA_FRAME, NONE } }, + { H_PRED, { 
INTRA_FRAME, NO_REF_FRAME } }, + { V_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D135_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D207_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D153_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D63_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D117_PRED, { INTRA_FRAME, NO_REF_FRAME } }, + { D45_PRED, { INTRA_FRAME, NO_REF_FRAME } }, }; static const REF_DEFINITION vp9_ref_order[MAX_REFS] = { - { { LAST_FRAME, NONE } }, { { GOLDEN_FRAME, NONE } }, - { { ALTREF_FRAME, NONE } }, { { LAST_FRAME, ALTREF_FRAME } }, - { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NONE } }, + { { LAST_FRAME, NO_REF_FRAME } }, { { GOLDEN_FRAME, NO_REF_FRAME } }, + { { ALTREF_FRAME, NO_REF_FRAME } }, { { LAST_FRAME, ALTREF_FRAME } }, + { { GOLDEN_FRAME, ALTREF_FRAME } }, { { INTRA_FRAME, NO_REF_FRAME } }, }; #endif // !CONFIG_REALTIME_ONLY @@ -160,10 +160,13 @@ static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx, int m, int n, } #if !CONFIG_REALTIME_ONLY -static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, - MACROBLOCKD *xd, int *out_rate_sum, - int64_t *out_dist_sum, int *skip_txfm_sb, - int64_t *skip_sse_sb) { +// Planewise build inter prediction and compute rdcost with early termination +// option +static int build_inter_pred_model_rd_earlyterm( + VP9_COMP *cpi, int mi_row, int mi_col, BLOCK_SIZE bsize, MACROBLOCK *x, + MACROBLOCKD *xd, int *out_rate_sum, int64_t *out_dist_sum, + int *skip_txfm_sb, int64_t *skip_sse_sb, int do_earlyterm, + int64_t best_rd) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. @@ -176,19 +179,15 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int64_t total_sse = 0; int skip_flag = 1; const int shift = 6; - int64_t dist; const int dequant_shift = #if CONFIG_VP9_HIGHBITDEPTH (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd - 5 : #endif // CONFIG_VP9_HIGHBITDEPTH 3; - unsigned int qstep_vec[MAX_MB_PLANE]; - unsigned int nlog2_vec[MAX_MB_PLANE]; - unsigned int sum_sse_vec[MAX_MB_PLANE]; - int any_zero_sum_sse = 0; x->pred_sse[ref] = 0; + // Build prediction signal, compute stats and RD cost on per-plane basis for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; @@ -207,7 +206,14 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, int idx, idy; int lw = b_width_log2_lookup[unit_size] + 2; int lh = b_height_log2_lookup[unit_size] + 2; + unsigned int qstep; + unsigned int nlog2; + int64_t dist = 0; + // Build inter predictor + vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i); + + // Compute useful stats for (idy = 0; idy < bh; ++idy) { for (idx = 0; idx < bw; ++idx) { uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw); @@ -243,46 +249,36 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, MACROBLOCK *x, } total_sse += sum_sse; - sum_sse_vec[i] = sum_sse; - any_zero_sum_sse = any_zero_sum_sse || (sum_sse == 0); - qstep_vec[i] = pd->dequant[1] >> dequant_shift; - nlog2_vec[i] = num_pels_log2_lookup[bs]; - } + qstep = pd->dequant[1] >> dequant_shift; + nlog2 = num_pels_log2_lookup[bs]; - // Fast approximate the modelling function. - if (cpi->sf.simple_model_rd_from_var) { - for (i = 0; i < MAX_MB_PLANE; ++i) { + // Fast approximate the modelling function. 
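`build_inter_pred_model_rd_earlyterm()` replaces the vectorized three-plane model with a single per-plane loop so that, as the lines that follow show, the running RD cost can be checked against `best_rd` after every plane and the candidate abandoned early. The control flow reduced to scalars; `rd_cost()` here is an illustrative weighting, not the encoder's exact RDCOST macro:

```c
#include <stdint.h>

/* Illustrative RD weighting, not the encoder's exact macro. */
static int64_t rd_cost(int rdmult, int64_t rate, int64_t dist) {
  return ((rate * rdmult) >> 8) + dist;
}

/* Accumulate per-plane rate/distortion estimates; return 1 as soon as the
 * running cost already exceeds the best mode found so far. */
static int model_rd_earlyterm_sketch(int num_planes, const int *plane_rate,
                                     const int64_t *plane_dist, int rdmult,
                                     int64_t best_rd, int64_t *rate_sum,
                                     int64_t *dist_sum) {
  int i;
  *rate_sum = 0;
  *dist_sum = 0;
  for (i = 0; i < num_planes; ++i) {
    *rate_sum += plane_rate[i];
    *dist_sum += plane_dist[i];
    if (rd_cost(rdmult, *rate_sum, *dist_sum) >= best_rd) return 1;
  }
  return 0;
}
```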
+ if (cpi->sf.simple_model_rd_from_var) { int64_t rate; - const int64_t square_error = sum_sse_vec[i]; - int quantizer = qstep_vec[i]; - - if (quantizer < 120) - rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT); + if (qstep < 120) + rate = ((int64_t)sum_sse * (280 - qstep)) >> (16 - VP9_PROB_COST_SHIFT); else rate = 0; - dist = (square_error * quantizer) >> 8; + dist = ((int64_t)sum_sse * qstep) >> 8; rate_sum += rate; - dist_sum += dist; - } - } else { - if (any_zero_sum_sse) { - for (i = 0; i < MAX_MB_PLANE; ++i) { - int rate; - vp9_model_rd_from_var_lapndz(sum_sse_vec[i], nlog2_vec[i], qstep_vec[i], - &rate, &dist); - rate_sum += rate; - dist_sum += dist; - } } else { - vp9_model_rd_from_var_lapndz_vec(sum_sse_vec, nlog2_vec, qstep_vec, - &rate_sum, &dist_sum); + int rate; + vp9_model_rd_from_var_lapndz(sum_sse, nlog2, qstep, &rate, &dist); + rate_sum += rate; + } + dist_sum += dist; + if (do_earlyterm) { + if (RDCOST(x->rdmult, x->rddiv, rate_sum, + dist_sum << VP9_DIST_SCALE_LOG2) >= best_rd) + return 1; } } - *skip_txfm_sb = skip_flag; *skip_sse_sb = total_sse << VP9_DIST_SCALE_LOG2; *out_rate_sum = (int)rate_sum; *out_dist_sum = dist_sum << VP9_DIST_SCALE_LOG2; + + return 0; } #endif // !CONFIG_REALTIME_ONLY @@ -462,11 +458,6 @@ static int cost_coeffs(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size, return cost; } -static INLINE int num_4x4_to_edge(int plane_4x4_dim, int mb_to_edge_dim, - int subsampling_dim, int blk_dim) { - return plane_4x4_dim + (mb_to_edge_dim >> (5 + subsampling_dim)) - blk_dim; -} - // Copy all visible 4x4s in the transform block. static void copy_block_visible(const MACROBLOCKD *xd, const struct macroblockd_plane *const pd, @@ -567,47 +558,11 @@ static unsigned pixel_sse(const VP9_COMP *const cpi, const MACROBLOCKD *xd, return sse; } -// Compute the squares sum squares on all visible 4x4s in the transform block. -static int64_t sum_squares_visible(const MACROBLOCKD *xd, - const struct macroblockd_plane *const pd, - const int16_t *diff, const int diff_stride, - int blk_row, int blk_col, - const BLOCK_SIZE plane_bsize, - const BLOCK_SIZE tx_bsize) { - int64_t sse; - const int plane_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize]; - const int plane_4x4_h = num_4x4_blocks_high_lookup[plane_bsize]; - const int tx_4x4_w = num_4x4_blocks_wide_lookup[tx_bsize]; - const int tx_4x4_h = num_4x4_blocks_high_lookup[tx_bsize]; - int b4x4s_to_right_edge = num_4x4_to_edge(plane_4x4_w, xd->mb_to_right_edge, - pd->subsampling_x, blk_col); - int b4x4s_to_bottom_edge = num_4x4_to_edge(plane_4x4_h, xd->mb_to_bottom_edge, - pd->subsampling_y, blk_row); - if (tx_bsize == BLOCK_4X4 || - (b4x4s_to_right_edge >= tx_4x4_w && b4x4s_to_bottom_edge >= tx_4x4_h)) { - assert(tx_4x4_w == tx_4x4_h); - sse = (int64_t)vpx_sum_squares_2d_i16(diff, diff_stride, tx_4x4_w << 2); - } else { - int r, c; - int max_r = VPXMIN(b4x4s_to_bottom_edge, tx_4x4_h); - int max_c = VPXMIN(b4x4s_to_right_edge, tx_4x4_w); - sse = 0; - // if we are in the unrestricted motion border. - for (r = 0; r < max_r; ++r) { - // Skip visiting the sub blocks that are wholly within the UMV. 
- for (c = 0; c < max_c; ++c) { - sse += (int64_t)vpx_sum_squares_2d_i16( - diff + r * diff_stride * 4 + c * 4, diff_stride, 4); - } - } - } - return sse; -} - static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, BLOCK_SIZE plane_bsize, int block, int blk_row, int blk_col, TX_SIZE tx_size, int64_t *out_dist, - int64_t *out_sse, struct buf_2d *out_recon) { + int64_t *out_sse, struct buf_2d *out_recon, + int sse_calc_done) { MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; @@ -633,15 +588,15 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, if (x->skip_encode && !is_inter_block(xd->mi[0])) { // TODO(jingning): tune the model to better capture the distortion. - const int64_t p = + const int64_t mean_quant_error = (pd->dequant[1] * pd->dequant[1] * (1 << ss_txfrm_size)) >> #if CONFIG_VP9_HIGHBITDEPTH (shift + 2 + (bd - 8) * 2); #else (shift + 2); #endif // CONFIG_VP9_HIGHBITDEPTH - *out_dist += (p >> 4); - *out_sse += p; + *out_dist += (mean_quant_error >> 4); + *out_sse += mean_quant_error; } } else { const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size]; @@ -657,8 +612,12 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); unsigned int tmp; - tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, - blk_col, plane_bsize, tx_bsize); + if (sse_calc_done) { + tmp = (unsigned int)(*out_sse); + } else { + tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, + blk_col, plane_bsize, tx_bsize); + } *out_sse = (int64_t)tmp * 16; if (out_recon) { const int out_recon_idx = 4 * (blk_row * out_recon->stride + blk_col); @@ -754,20 +713,29 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, const struct macroblockd_plane *const pd = &xd->plane[plane]; const int dst_stride = pd->dst.stride; const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)]; + const int enable_trellis_opt = args->cpi->sf.trellis_opt_tx_rd.method; + const double trellis_opt_thresh = args->cpi->sf.trellis_opt_tx_rd.thresh; + int sse_calc_done = 0; +#if CONFIG_MISMATCH_DEBUG + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip, + 0, // mi_row + 0, // mi_col + 0 // output_enabled + }; +#else + struct encode_b_args encode_b_arg = { + x, enable_trellis_opt, trellis_opt_thresh, &sse_calc_done, + &sse, args->t_above, args->t_left, &mi->skip + }; +#endif if (args->exit_early) return; if (!is_inter_block(mi)) { -#if CONFIG_MISMATCH_DEBUG - struct encode_b_args intra_arg = { - x, x->block_qcoeff_opt, args->t_above, args->t_left, &mi->skip, 0, 0, 0 - }; -#else - struct encode_b_args intra_arg = { x, x->block_qcoeff_opt, args->t_above, - args->t_left, &mi->skip }; -#endif vp9_encode_block_intra(plane, block, blk_row, blk_col, plane_bsize, tx_size, - &intra_arg); + &encode_b_arg); if (recon) { uint8_t *rec_ptr = &recon->buf[4 * (blk_row * recon->stride + blk_col)]; copy_block_visible(xd, pd, dst, dst_stride, rec_ptr, recon->stride, @@ -775,16 +743,21 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, } if (x->block_tx_domain) { dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, /*recon =*/0); + tx_size, &dist, &sse, /*out_recon=*/NULL, sse_calc_done); } else { const struct macroblock_plane 
*const p = &x->plane[plane]; const int src_stride = p->src.stride; - const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)]; - const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; unsigned int tmp; - sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, - plane_bsize, tx_bsize); + if (!sse_calc_done) { + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + int visible_width, visible_height; + sse = sum_squares_visible(xd, pd, diff, diff_stride, blk_row, blk_col, + plane_bsize, tx_bsize, &visible_width, + &visible_height); + } #if CONFIG_VP9_HIGHBITDEPTH if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8)) sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2); @@ -808,12 +781,18 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, if (skip_txfm_flag == SKIP_TXFM_NONE || (recon && skip_txfm_flag == SKIP_TXFM_AC_ONLY)) { + const struct macroblock_plane *const p = &x->plane[plane]; + const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize]; + const int16_t *const diff = + &p->src_diff[4 * (blk_row * diff_stride + blk_col)]; + const int use_trellis_opt = + do_trellis_opt(pd, diff, diff_stride, blk_row, blk_col, plane_bsize, + tx_size, &encode_b_arg); // full forward transform and quantization vp9_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size); - if (x->block_qcoeff_opt) - vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); + if (use_trellis_opt) vp9_optimize_b(x, plane, block, tx_size, coeff_ctx); dist_block(args->cpi, x, plane, plane_bsize, block, blk_row, blk_col, - tx_size, &dist, &sse, recon); + tx_size, &dist, &sse, recon, sse_calc_done); } else if (skip_txfm_flag == SKIP_TXFM_AC_ONLY) { // compute DC coefficient tran_low_t *const coeff = BLOCK_OFFSET(x->plane[plane].coeff, block); @@ -1149,13 +1128,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_highbd_fwht4x4(src_diff, coeff, 8); - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 
1 : 0); @@ -1166,16 +1144,15 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); if (tx_type == DCT_DCT) vpx_highbd_fdct4x4(src_diff, coeff, 8); else vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type); - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, - eob, so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, + eob, so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); distortion += vp9_highbd_block_error_dispatch( @@ -1256,13 +1233,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); if (xd->lossless) { - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fwht4x4(src_diff, coeff, 8); - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0; @@ -1273,13 +1249,12 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row, } else { int64_t unused; const TX_TYPE tx_type = get_tx_type_4x4(PLANE_TYPE_Y, xd, block); - const scan_order *so = &vp9_scan_orders[TX_4X4][tx_type]; + const ScanOrder *so = &vp9_scan_orders[TX_4X4][tx_type]; const int coeff_ctx = combine_entropy_contexts(tempa[idx], templ[idy]); vp9_fht4x4(src_diff, coeff, 8, tx_type); - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan, so->neighbors, cpi->sf.use_fast_coef_costing); tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 
1 : 0; @@ -1416,7 +1391,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, mic->mode = mode; super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion, &s, NULL, - bsize, best_rd, /*recon = */ 0); + bsize, best_rd, /*recon=*/NULL); if (this_rate_tokenonly == INT_MAX) continue; @@ -1456,7 +1431,6 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, if (ref_best_rd < 0) is_cost_valid = 0; if (is_inter_block(mi) && is_cost_valid) { - int plane; for (plane = 1; plane < MAX_MB_PLANE; ++plane) vp9_subtract_plane(x, bsize, plane); } @@ -1469,7 +1443,7 @@ static int super_block_uvrd(const VP9_COMP *cpi, MACROBLOCK *x, int *rate, for (plane = 1; plane < MAX_MB_PLANE; ++plane) { txfm_rd_in_plane(cpi, x, &pnrate, &pndist, &pnskip, &pnsse, ref_best_rd, plane, bsize, uv_tx_size, cpi->sf.use_fast_coef_costing, - /*recon = */ 0); + /*recon=*/NULL); if (pnrate == INT_MAX) { is_cost_valid = 0; break; @@ -1652,7 +1626,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, &pd->dst.buf[vp9_raster_block_offset(BLOCK_8X8, i, pd->dst.stride)]; int64_t thisdistortion = 0, thissse = 0; int thisrate = 0, ref; - const scan_order *so = &vp9_default_scan_orders[TX_4X4]; + const ScanOrder *so = &vp9_default_scan_orders[TX_4X4]; const int is_compound = has_second_ref(mi); const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; @@ -1732,14 +1706,12 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x, x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff), coeff, 8); #if CONFIG_VP9_HIGHBITDEPTH - vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, - p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, - so->scan, so->iscan); + vpx_highbd_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, + so); thisdistortion += vp9_highbd_block_error_dispatch( coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd); #else - vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift, - qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan); + vpx_quantize_b(coeff, 4 * 4, p, qcoeff, dqcoeff, pd->dequant, eob, so); thisdistortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz); #endif // CONFIG_VP9_HIGHBITDEPTH @@ -1833,7 +1805,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, const MV_REFERENCE_FRAME ref_frames[2]) { if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) && frame_mv[this_mode][ref_frames[0]].as_int == 0 && - (ref_frames[1] == NONE || + (ref_frames[1] == NO_REF_FRAME || frame_mv[this_mode][ref_frames[1]].as_int == 0)) { int rfc = mode_context[ref_frames[0]]; int c1 = cost_mv_ref(cpi, NEARMV, rfc); @@ -1846,7 +1818,7 @@ static int check_best_zero_mv(const VP9_COMP *cpi, if (c2 > c3) return 0; } else { assert(this_mode == ZEROMV); - if (ref_frames[1] == NONE) { + if (ref_frames[1] == NO_REF_FRAME) { if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) || (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0)) return 0; @@ -1862,10 +1834,80 @@ static int check_best_zero_mv(const VP9_COMP *cpi, return 1; } +static INLINE int skip_iters(const int_mv iter_mvs[][2], int ite, int id) { + if (ite >= 2 && iter_mvs[ite - 2][!id].as_int == iter_mvs[ite][!id].as_int) { + int_mv cur_fullpel_mv, prev_fullpel_mv; + cur_fullpel_mv.as_mv.row = iter_mvs[ite][id].as_mv.row >> 3; + cur_fullpel_mv.as_mv.col = iter_mvs[ite][id].as_mv.col >> 3; + prev_fullpel_mv.as_mv.row = iter_mvs[ite - 2][id].as_mv.row >> 3; + 
prev_fullpel_mv.as_mv.col = iter_mvs[ite - 2][id].as_mv.col >> 3; + if (cur_fullpel_mv.as_int == prev_fullpel_mv.as_int) return 1; + } + return 0; +} + +// Compares motion vector and mode rate of current mode and given mode. +static INLINE int compare_mv_mode_rate(MV this_mv, MV mode_mv, + int this_mode_rate, int mode_rate, + int mv_thresh) { + const int mv_diff = + abs(mode_mv.col - this_mv.col) + abs(mode_mv.row - this_mv.row); + if (mv_diff <= mv_thresh && mode_rate < this_mode_rate) return 1; + return 0; +} + +// Skips single reference inter modes NEARMV and ZEROMV based on motion vector +// difference and mode rate. +static INLINE int skip_single_mode_based_on_mode_rate( + int_mv (*mode_mv)[MAX_REF_FRAMES], int *single_mode_rate, int this_mode, + int ref0, int this_mode_rate, int best_mode_index) { + MV this_mv = mode_mv[this_mode][ref0].as_mv; + const int mv_thresh = 3; + + // Pruning is not applicable for NEARESTMV or NEWMV modes. + if (this_mode == NEARESTMV || this_mode == NEWMV) return 0; + // Pruning is not done when reference frame of the mode is same as best + // reference so far. + if (best_mode_index > 0 && + ref0 == vp9_mode_order[best_mode_index].ref_frame[0]) + return 0; + + // Check absolute mv difference and mode rate of current mode w.r.t NEARESTMV + if (compare_mv_mode_rate( + this_mv, mode_mv[NEARESTMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARESTMV)], mv_thresh)) + return 1; + + // Check absolute mv difference and mode rate of current mode w.r.t NEWMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEWMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEWMV)], mv_thresh)) + return 1; + + // Pruning w.r.t NEARMV is applicable only for ZEROMV mode + if (this_mode == NEARMV) return 0; + // Check absolute mv difference and mode rate of current mode w.r.t NEARMV + if (compare_mv_mode_rate(this_mv, mode_mv[NEARMV][ref0].as_mv, this_mode_rate, + single_mode_rate[INTER_OFFSET(NEARMV)], mv_thresh)) + return 1; + return 0; +} + +#define MAX_JOINT_MV_SEARCH_ITERS 4 +static INLINE int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) { + int num_iters = MAX_JOINT_MV_SEARCH_ITERS; // sf_level = 0 + if (sf_level >= 2) + num_iters = 0; + else if (sf_level >= 1) + num_iters = bsize < BLOCK_8X8 + ? 0 + : (bsize <= BLOCK_16X16 ? 2 : MAX_JOINT_MV_SEARCH_ITERS); + return num_iters; +} + static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int_mv *frame_mv, int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], - int *rate_mv) { + int *rate_mv, int num_iters) { const VP9_COMMON *const cm = &cpi->common; const int pw = 4 * num_4x4_blocks_wide_lookup[bsize]; const int ph = 4 * num_4x4_blocks_high_lookup[bsize]; @@ -1874,6 +1916,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, const int refs[2] = { mi->ref_frame[0], mi->ref_frame[1] < 0 ? 0 : mi->ref_frame[1] }; int_mv ref_mv[2]; + int_mv iter_mvs[MAX_JOINT_MV_SEARCH_ITERS][2]; int ite, ref; const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter]; struct scale_factors sf; @@ -1888,12 +1931,15 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Prediction buffer from second frame. 
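`get_joint_search_iters()` above replaces the fixed four-iteration compound search with a speed-feature-controlled budget, and `skip_iters()` lets the loop stop once the MV pair stops moving. A simplified sketch of the loop's shape (it compares both MVs at full precision, whereas `skip_iters()` checks the refined MV at full-pel only; `refine()` stands in for one diamond-plus-subpel pass):

```c
/* Minimal sketch of the alternating joint search: even iterations refine
 * mv[0] against a prediction from mv[1], odd iterations the reverse; stop
 * when the budget runs out, refinement fails, or the state repeats. */
typedef struct { int row, col; } mv_sketch;

static int mv_eq(mv_sketch a, mv_sketch b) {
  return a.row == b.row && a.col == b.col;
}

static void joint_search_sketch(mv_sketch mv[2], int num_iters,
                                int (*refine)(mv_sketch *cur,
                                              const mv_sketch *other)) {
  mv_sketch hist[4][2]; /* num_iters is capped at MAX_JOINT_MV_SEARCH_ITERS */
  int ite;
  for (ite = 0; ite < num_iters; ++ite) {
    const int id = ite & 1; /* which reference gets refined this round */
    hist[ite][0] = mv[0];
    hist[ite][1] = mv[1];
    if (ite >= 2 && mv_eq(hist[ite][id], hist[ite - 2][id]) &&
        mv_eq(hist[ite][!id], hist[ite - 2][!id]))
      break; /* same MV pair as two iterations ago: converged */
    if (!refine(&mv[id], &mv[!id])) break; /* no better MV found */
  }
}
```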
#if CONFIG_VP9_HIGHBITDEPTH - DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]); + DECLARE_ALIGNED(32, uint16_t, second_pred_alloc_16[64 * 64]); uint8_t *second_pred; #else - DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]); + DECLARE_ALIGNED(32, uint8_t, second_pred[64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH + // Check that the number of iterations does not exceed the max + assert(num_iters <= MAX_JOINT_MV_SEARCH_ITERS); + for (ref = 0; ref < 2; ++ref) { ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0]; @@ -1909,6 +1955,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int; + iter_mvs[0][ref].as_int = single_newmv[refs[ref]].as_int; } // Since we have scaled the reference frames to match the size of the current @@ -1923,7 +1970,7 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // Allow joint search multiple times iteratively for each reference frame // and break out of the search loop if it couldn't find a better mv. - for (ite = 0; ite < 4; ite++) { + for (ite = 0; ite < num_iters; ite++) { struct buf_2d ref_yv12[2]; uint32_t bestsme = UINT_MAX; int sadpb = x->sadperbit16; @@ -1935,6 +1982,11 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, // odd iterations search in the second. The predictor // found for the 'other' reference frame is factored in. + // Skip further iterations of the search if, in the previous iteration, the + // motion vector of the searched ref frame is unchanged, and the other ref + // frame's full-pixel mv is unchanged. + if (skip_iters(iter_mvs, ite, id)) break; + // Initialized here because of compiler problem in Visual Studio. ref_yv12[0] = xd->plane[0].pre[0]; ref_yv12[1] = xd->plane[0].pre[1]; @@ -2000,6 +2052,10 @@ } else { break; } + if (ite < num_iters - 1) { + iter_mvs[ite + 1][0].as_int = frame_mv[refs[0]].as_int; + iter_mvs[ite + 1][1].as_int = frame_mv[refs[1]].as_int; + } } *rate_mv = 0; @@ -2020,7 +2076,7 @@ static int64_t rd_pick_best_sub8x8_mode( VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, - int_mv *second_best_ref_mv, int64_t best_rd, int *returntotrate, + int_mv *second_best_ref_mv, int64_t best_rd_so_far, int *returntotrate, int *returnyrate, int64_t *returndistortion, int *skippable, int64_t *psse, int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], BEST_SEG_INFO *bsi_buf, int filter_idx, int mi_row, int mi_col) { @@ -2053,7 +2109,7 @@ vp9_zero(*bsi); - bsi->segment_rd = best_rd; + bsi->segment_rd = best_rd_so_far; bsi->ref_mv[0] = best_ref_mv; bsi->ref_mv[1] = second_best_ref_mv; bsi->mvp.as_int = best_ref_mv->as_int; @@ -2079,14 +2135,14 @@ int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; PREDICTION_MODE mode_selected = ZEROMV; int64_t best_rd = INT64_MAX; - const int i = idy * 2 + idx; + const int block = idy * 2 + idx; int ref; for (ref = 0; ref < 1 + has_second_rf; ++ref) { const MV_REFERENCE_FRAME frame = mi->ref_frame[ref]; frame_mv[ZEROMV][frame].as_int = 0; vp9_append_sub8x8_mvs_for_idx( - cm, xd, i, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], + cm, xd, block, ref, mi_row, mi_col, &frame_mv[NEARESTMV][frame], &frame_mv[NEARMV][frame], mbmi_ext->mode_context); } @@ -2096,7 +2152,7 @@ struct
buf_2d orig_pre[2]; mode_idx = INTER_OFFSET(this_mode); - bsi->rdstat[i][mode_idx].brdcost = INT64_MAX; + bsi->rdstat[block][mode_idx].brdcost = INT64_MAX; if (!(inter_mode_mask & (1 << this_mode))) continue; if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv, @@ -2104,14 +2160,14 @@ static int64_t rd_pick_best_sub8x8_mode( continue; memcpy(orig_pre, pd->pre, sizeof(orig_pre)); - memcpy(bsi->rdstat[i][mode_idx].ta, t_above, - sizeof(bsi->rdstat[i][mode_idx].ta)); - memcpy(bsi->rdstat[i][mode_idx].tl, t_left, - sizeof(bsi->rdstat[i][mode_idx].tl)); + memcpy(bsi->rdstat[block][mode_idx].ta, t_above, + sizeof(bsi->rdstat[block][mode_idx].ta)); + memcpy(bsi->rdstat[block][mode_idx].tl, t_left, + sizeof(bsi->rdstat[block][mode_idx].tl)); // motion search for newmv (single predictor case only) if (!has_second_rf && this_mode == NEWMV && - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) { + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) { MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; uint32_t bestsme = UINT_MAX; @@ -2121,18 +2177,19 @@ static int64_t rd_pick_best_sub8x8_mode( int cost_list[5]; const MvLimits tmp_mv_limits = x->mv_limits; - /* Is the best so far sufficiently good that we cant justify doing + /* Is the best so far sufficiently good that we can't justify doing * and new motion search. */ if (best_rd < label_mv_thresh) break; if (cpi->oxcf.mode != BEST) { // use previous block's result as next block's MV predictor. - if (i > 0) { - bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int; - if (i == 2) bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int; + if (block > 0) { + bsi->mvp.as_int = mi->bmi[block - 1].as_mv[0].as_int; + if (block == 2) + bsi->mvp.as_int = mi->bmi[block - 2].as_mv[0].as_int; } } - if (i == 0) + if (block == 0) max_mv = x->max_mv_context[mi->ref_frame[0]]; else max_mv = @@ -2161,7 +2218,7 @@ static int64_t rd_pick_best_sub8x8_mode( } // adjust src pointer for this block - mi_buf_shift(x, i); + mi_buf_shift(x, block); vp9_set_mv_search_range(&x->mv_limits, &bsi->ref_mv[0]->as_mv); @@ -2184,7 +2241,7 @@ static int64_t rd_pick_best_sub8x8_mode( cpi->sf.use_accurate_subpel_search); // save motion search result for use in compound prediction - seg_mvs[i][mi->ref_frame[0]].as_mv = *new_mv; + seg_mvs[block][mi->ref_frame[0]].as_mv = *new_mv; } x->pred_mv[mi->ref_frame[0]] = *new_mv; @@ -2194,40 +2251,44 @@ static int64_t rd_pick_best_sub8x8_mode( } if (has_second_rf) { - if (seg_mvs[i][mi->ref_frame[1]].as_int == INVALID_MV || - seg_mvs[i][mi->ref_frame[0]].as_int == INVALID_MV) + if (seg_mvs[block][mi->ref_frame[1]].as_int == INVALID_MV || + seg_mvs[block][mi->ref_frame[0]].as_int == INVALID_MV) continue; } if (has_second_rf && this_mode == NEWMV && mi->interp_filter == EIGHTTAP) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); // adjust src pointers - mi_buf_shift(x, i); - if (sf->comp_inter_joint_search_thresh <= bsize) { + mi_buf_shift(x, block); + if (num_joint_search_iters) { int rate_mv; joint_motion_search(cpi, x, bsize, frame_mv[this_mode], mi_row, - mi_col, seg_mvs[i], &rate_mv); - seg_mvs[i][mi->ref_frame[0]].as_int = + mi_col, seg_mvs[block], &rate_mv, + num_joint_search_iters); + seg_mvs[block][mi->ref_frame[0]].as_int = frame_mv[this_mode][mi->ref_frame[0]].as_int; - seg_mvs[i][mi->ref_frame[1]].as_int = + seg_mvs[block][mi->ref_frame[1]].as_int = frame_mv[this_mode][mi->ref_frame[1]].as_int; } // 
restore src pointers mi_buf_restore(x, orig_src, orig_pre); } - bsi->rdstat[i][mode_idx].brate = set_and_cost_bmi_mvs( - cpi, x, xd, i, this_mode, mode_mv[this_mode], frame_mv, seg_mvs[i], - bsi->ref_mv, x->nmvjointcost, x->mvcost); + bsi->rdstat[block][mode_idx].brate = set_and_cost_bmi_mvs( + cpi, x, xd, block, this_mode, mode_mv[this_mode], frame_mv, + seg_mvs[block], bsi->ref_mv, x->nmvjointcost, x->mvcost); for (ref = 0; ref < 1 + has_second_rf; ++ref) { - bsi->rdstat[i][mode_idx].mvs[ref].as_int = + bsi->rdstat[block][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 1][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int = + bsi->rdstat[block + 2][mode_idx].mvs[ref].as_int = mode_mv[this_mode][ref].as_int; } @@ -2245,7 +2306,7 @@ static int64_t rd_pick_best_sub8x8_mode( for (ref = 0; ref < 1 + has_second_rf; ++ref) { subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv); have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (filter_idx > 1 && !subpelmv && !have_ref) { @@ -2253,53 +2314,55 @@ static int64_t rd_pick_best_sub8x8_mode( have_ref = 1; for (ref = 0; ref < 1 + has_second_rf; ++ref) have_ref &= mode_mv[this_mode][ref].as_int == - ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int; + ref_bsi->rdstat[block][mode_idx].mvs[ref].as_int; } if (!subpelmv && have_ref && - ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx], - sizeof(SEG_RDSTAT)); + ref_bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + memcpy(&bsi->rdstat[block][mode_idx], + &ref_bsi->rdstat[block][mode_idx], sizeof(SEG_RDSTAT)); if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = - ref_bsi->rdstat[i + 1][mode_idx].eobs; + bsi->rdstat[block + 1][mode_idx].eobs = + ref_bsi->rdstat[block + 1][mode_idx].eobs; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = - ref_bsi->rdstat[i + 2][mode_idx].eobs; + bsi->rdstat[block + 2][mode_idx].eobs = + ref_bsi->rdstat[block + 2][mode_idx].eobs; - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } continue; } } - bsi->rdstat[i][mode_idx].brdcost = encode_inter_mb_segment( - cpi, x, bsi->segment_rd - this_segment_rd, i, - &bsi->rdstat[i][mode_idx].byrate, &bsi->rdstat[i][mode_idx].bdist, - &bsi->rdstat[i][mode_idx].bsse, bsi->rdstat[i][mode_idx].ta, - bsi->rdstat[i][mode_idx].tl, mi_row, mi_col); - if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) { - bsi->rdstat[i][mode_idx].brdcost += - RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate, 0); - bsi->rdstat[i][mode_idx].brate += bsi->rdstat[i][mode_idx].byrate; - bsi->rdstat[i][mode_idx].eobs = p->eobs[i]; + bsi->rdstat[block][mode_idx].brdcost = encode_inter_mb_segment( + cpi, x, bsi->segment_rd - this_segment_rd, block, + &bsi->rdstat[block][mode_idx].byrate, + &bsi->rdstat[block][mode_idx].bdist, + &bsi->rdstat[block][mode_idx].bsse, bsi->rdstat[block][mode_idx].ta, + bsi->rdstat[block][mode_idx].tl, mi_row, mi_col); + if (bsi->rdstat[block][mode_idx].brdcost < INT64_MAX) { + bsi->rdstat[block][mode_idx].brdcost += RDCOST( + x->rdmult, x->rddiv, 
bsi->rdstat[block][mode_idx].brate, 0); + bsi->rdstat[block][mode_idx].brate += + bsi->rdstat[block][mode_idx].byrate; + bsi->rdstat[block][mode_idx].eobs = p->eobs[block]; if (num_4x4_blocks_wide > 1) - bsi->rdstat[i + 1][mode_idx].eobs = p->eobs[i + 1]; + bsi->rdstat[block + 1][mode_idx].eobs = p->eobs[block + 1]; if (num_4x4_blocks_high > 1) - bsi->rdstat[i + 2][mode_idx].eobs = p->eobs[i + 2]; + bsi->rdstat[block + 2][mode_idx].eobs = p->eobs[block + 2]; } - if (bsi->rdstat[i][mode_idx].brdcost < best_rd) { + if (bsi->rdstat[block][mode_idx].brdcost < best_rd) { mode_selected = this_mode; - best_rd = bsi->rdstat[i][mode_idx].brdcost; + best_rd = bsi->rdstat[block][mode_idx].brdcost; } } /*for each 4x4 mode*/ if (best_rd == INT64_MAX) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2307,22 +2370,22 @@ static int64_t rd_pick_best_sub8x8_mode( } mode_idx = INTER_OFFSET(mode_selected); - memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above)); - memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left)); + memcpy(t_above, bsi->rdstat[block][mode_idx].ta, sizeof(t_above)); + memcpy(t_left, bsi->rdstat[block][mode_idx].tl, sizeof(t_left)); - set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected], - frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost, - x->mvcost); + set_and_cost_bmi_mvs(cpi, x, xd, block, mode_selected, + mode_mv[mode_selected], frame_mv, seg_mvs[block], + bsi->ref_mv, x->nmvjointcost, x->mvcost); - br += bsi->rdstat[i][mode_idx].brate; - bd += bsi->rdstat[i][mode_idx].bdist; - block_sse += bsi->rdstat[i][mode_idx].bsse; - segmentyrate += bsi->rdstat[i][mode_idx].byrate; - this_segment_rd += bsi->rdstat[i][mode_idx].brdcost; + br += bsi->rdstat[block][mode_idx].brate; + bd += bsi->rdstat[block][mode_idx].bdist; + block_sse += bsi->rdstat[block][mode_idx].bsse; + segmentyrate += bsi->rdstat[block][mode_idx].byrate; + this_segment_rd += bsi->rdstat[block][mode_idx].brdcost; if (this_segment_rd > bsi->segment_rd) { int iy, midx; - for (iy = i + 1; iy < 4; ++iy) + for (iy = block + 1; iy < 4; ++iy) for (midx = 0; midx < INTER_MODES; ++midx) bsi->rdstat[iy][midx].brdcost = INT64_MAX; bsi->segment_rd = INT64_MAX; @@ -2340,7 +2403,7 @@ static int64_t rd_pick_best_sub8x8_mode( // update the coding decisions for (k = 0; k < 4; ++k) bsi->modes[k] = mi->bmi[k].as_mode; - if (bsi->segment_rd > best_rd) return INT64_MAX; + if (bsi->segment_rd > best_rd_so_far) return INT64_MAX; /* set it to the best */ for (i = 0; i < 4; i++) { mode_idx = INTER_OFFSET(bsi->modes[i]); @@ -2585,9 +2648,9 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, tmp_mv->as_int = INVALID_MV; if (scaled_ref_frame) { - int i; - for (i = 0; i < MAX_MB_PLANE; ++i) - xd->plane[i].pre[0] = backup_yv12[i]; + int j; + for (j = 0; j < MAX_MB_PLANE; ++j) + xd->plane[j].pre[0] = backup_yv12[j]; } return; } @@ -2752,8 +2815,9 @@ static int64_t handle_inter_mode( struct buf_2d *recon, int *disable_skip, int_mv (*mode_mv)[MAX_REF_FRAMES], int mi_row, int mi_col, int_mv single_newmv[MAX_REF_FRAMES], INTERP_FILTER (*single_filter)[MAX_REF_FRAMES], - int (*single_skippable)[MAX_REF_FRAMES], int64_t *psse, - const int64_t ref_best_rd, int64_t *mask_filter, int64_t filter_cache[]) { + int (*single_skippable)[MAX_REF_FRAMES], int *single_mode_rate, + int64_t *psse, const int64_t ref_best_rd, int64_t *mask_filter, + int64_t 
filter_cache[], int best_mode_index) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; @@ -2771,9 +2835,8 @@ static int64_t handle_inter_mode( #else DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]); #endif // CONFIG_VP9_HIGHBITDEPTH - int pred_exists = 0; int intpel_mv; - int64_t rd, tmp_rd, best_rd = INT64_MAX; + int64_t rd, tmp_rd = INT64_MAX, best_rd = INT64_MAX; int best_needs_copy = 0; uint8_t *orig_dst[MAX_MB_PLANE]; int orig_dst_stride[MAX_MB_PLANE]; @@ -2782,13 +2845,12 @@ uint8_t skip_txfm[MAX_MB_PLANE << 2] = { 0 }; int64_t bsse[MAX_MB_PLANE << 2] = { 0 }; - int bsl = mi_width_log2_lookup[bsize]; - int pred_filter_search = - cpi->sf.cb_pred_filter_search - ? (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm->current_video_frame)) & - 0x1 - : 0; + const int bsl = mi_width_log2_lookup[bsize]; + const int blk_parity = (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & + 0x1; + const int pred_filter_search = + (cpi->sf.cb_pred_filter_search >= 2) && blk_parity; int skip_txfm_sb = 0; int64_t skip_sse_sb = INT64_MAX; @@ -2827,13 +2889,23 @@ if (this_mode == NEWMV) { int rate_mv; if (is_comp_pred) { + // Decide number of joint motion search iterations + const int num_joint_search_iters = get_joint_search_iters( + cpi->sf.comp_inter_joint_search_iter_level, bsize); + // Initialize mv using single prediction mode result. frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int; frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int; - if (cpi->sf.comp_inter_joint_search_thresh <= bsize) { + if (num_joint_search_iters) { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, joint_motion_search_time); +#endif joint_motion_search(cpi, x, bsize, frame_mv, mi_row, mi_col, - single_newmv, &rate_mv); + single_newmv, &rate_mv, num_joint_search_iters); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, joint_motion_search_time); +#endif } else { rate_mv = vp9_mv_bit_cost(&frame_mv[refs[0]].as_mv, &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv, @@ -2845,7 +2917,13 @@ *rate2 += rate_mv; } else { int_mv tmp_mv; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, single_motion_search_time); +#endif single_motion_search(cpi, x, bsize, mi_row, mi_col, &tmp_mv, &rate_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, single_motion_search_time); +#endif if (tmp_mv.as_int == INVALID_MV) return INT64_MAX; frame_mv[refs[0]].as_int = xd->mi[0]->bmi[0].as_mv[0].as_int = @@ -2899,23 +2977,45 @@ *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]); } + if (!is_comp_pred && cpi->sf.prune_single_mode_based_on_mv_diff_mode_rate) { + single_mode_rate[INTER_OFFSET(this_mode)] = *rate2; + // Prune NEARMV and ZEROMV modes based on motion vector difference and mode + // rate. + if (skip_single_mode_based_on_mode_rate(mode_mv, single_mode_rate, + this_mode, refs[0], *rate2, + best_mode_index)) { + // Check that when the single inter mode is pruned, the NEARESTMV or + // NEWMV modes are not early terminated. This ensures that not all + // single modes get skipped when the speed feature is enabled. 
+ assert(single_mode_rate[INTER_OFFSET(NEARESTMV)] != INT_MAX || + single_mode_rate[INTER_OFFSET(NEWMV)] != INT_MAX); + return INT64_MAX; + } + } if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd && mi->mode != NEARESTMV) return INT64_MAX; - pred_exists = 0; // Are all MVs integer pel for Y and UV intpel_mv = !mv_has_subpel(&mi->mv[0].as_mv); if (is_comp_pred) intpel_mv &= !mv_has_subpel(&mi->mv[1].as_mv); +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, interp_filter_time); +#endif // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) filter_cache[i] = INT64_MAX; if (cm->interp_filter != BILINEAR) { + // Use cb pattern for filter eval when filter is not switchable + const int enable_interp_search = + (cpi->sf.cb_pred_filter_search && cm->interp_filter != SWITCHABLE) + ? blk_parity + : 1; if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) { best_filter = EIGHTTAP; - } else if (best_filter == SWITCHABLE) { + } else if (best_filter == SWITCHABLE && enable_interp_search) { int newbest; int tmp_rate_sum = 0; int64_t tmp_dist_sum = 0; @@ -2925,6 +3025,9 @@ static int64_t handle_inter_mode( int64_t rs_rd; int tmp_skip_sb = 0; int64_t tmp_skip_sse = INT64_MAX; + const int enable_earlyterm = + cpi->sf.early_term_interp_search_plane_rd && cm->interp_filter != i; + int64_t filt_best_rd; mi->interp_filter = i; rs = vp9_get_switchable_rate(cpi, xd); @@ -2958,9 +3061,16 @@ static int64_t handle_inter_mode( xd->plane[j].dst.stride = 64; } } - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum, &tmp_skip_sb, - &tmp_skip_sse); + + filt_best_rd = + cm->interp_filter == SWITCHABLE ? (best_rd - rs_rd) : best_rd; + if (build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &rate_sum, &dist_sum, + &tmp_skip_sb, &tmp_skip_sse, enable_earlyterm, + filt_best_rd)) { + filter_cache[i] = INT64_MAX; + continue; + } rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum); filter_cache[i] = rd; @@ -2993,7 +3103,6 @@ static int64_t handle_inter_mode( if ((cm->interp_filter == SWITCHABLE && newbest) || (cm->interp_filter != SWITCHABLE && cm->interp_filter == mi->interp_filter)) { - pred_exists = 1; tmp_rd = best_rd; skip_txfm_sb = tmp_skip_sb; @@ -3005,12 +3114,15 @@ static int64_t handle_inter_mode( restore_dst_buf(xd, orig_dst, orig_dst_stride); } } +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, interp_filter_time); +#endif // Set the appropriate filter mi->interp_filter = cm->interp_filter != SWITCHABLE ? cm->interp_filter : best_filter; rs = cm->interp_filter == SWITCHABLE ? vp9_get_switchable_rate(cpi, xd) : 0; - if (pred_exists) { + if (tmp_rd != INT64_MAX) { if (best_needs_copy) { // again temporarily set the buffers to local memory to prevent a memcpy for (i = 0; i < MAX_MB_PLANE; i++) { @@ -3025,9 +3137,9 @@ static int64_t handle_inter_mode( // Handles the special case when a filter that is not in the // switchable list (ex. bilinear) is indicated at the frame level, or // skip condition holds. 
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); - model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, - &skip_sse_sb); + build_inter_pred_model_rd_earlyterm( + cpi, mi_row, mi_col, bsize, x, xd, &tmp_rate, &tmp_dist, &skip_txfm_sb, + &skip_sse_sb, 0 /*do_earlyterm*/, INT64_MAX); rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist); memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm)); memcpy(bsse, x->bsse, sizeof(bsse)); @@ -3120,7 +3232,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *rd_cost, x->skip_encode = 0; ctx->skip = 0; xd->mi[0]->ref_frame[0] = INTRA_FRAME; - xd->mi[0]->ref_frame[1] = NONE; + xd->mi[0]->ref_frame[1] = NO_REF_FRAME; // Initialize interp_filter here so we do not have to check for inter block // modes in get_pred_context_switchable_interp() xd->mi[0]->interp_filter = SWITCHABLE_FILTERS; @@ -3344,6 +3456,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } }; INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES]; int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES]; + int single_mode_rate[MAX_REF_FRAMES][INTER_MODES]; int64_t best_rd = best_rd_so_far; int64_t best_pred_diff[REFERENCE_MODES]; int64_t best_pred_rd[REFERENCE_MODES]; @@ -3493,7 +3606,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, mode_skip_mask[GOLDEN_FRAME] |= INTER_ALL; } - if (bsize > sf->max_intra_bsize) { + if (bsize > sf->max_intra_bsize && cpi->ref_frame_flags != 0) { ref_frame_skip_mask[0] |= (1 << INTRA_FRAME); ref_frame_skip_mask[1] |= (1 << INTRA_FRAME); } @@ -3542,6 +3655,10 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, second_ref_frame = vp9_mode_order[mode_index].ref_frame[1]; vp9_zero(x->sum_y_eobs); + comp_pred = second_ref_frame > INTRA_FRAME; + if (!comp_pred && ref_frame != INTRA_FRAME && + sf->prune_single_mode_based_on_mv_diff_mode_rate) + single_mode_rate[ref_frame][INTER_OFFSET(this_mode)] = INT_MAX; if (is_rect_partition) { if (ctx->skip_ref_frame_mask & (1 << ref_frame)) continue; @@ -3560,7 +3677,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK; break; case ALTREF_FRAME: ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK; break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -3593,7 +3710,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, MODE_INFO *ref_mi; int const_motion = 1; int skip_ref_frame = !cb_partition_search_ctrl; - MV_REFERENCE_FRAME rf = NONE; + MV_REFERENCE_FRAME rf = NO_REF_FRAME; int_mv ref_mv; ref_mv.as_int = INVALID_MV; @@ -3610,7 +3727,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if ((mi_col - 1) >= tile_info->mi_col_start) { if (ref_mv.as_int == INVALID_MV) ref_mv = xd->mi[-1]->mv[0]; - if (rf == NONE) rf = xd->mi[-1]->ref_frame[0]; + if (rf == NO_REF_FRAME) rf = xd->mi[-1]->ref_frame[0]; for (i = 0; i < mi_height; ++i) { ref_mi = xd->mi[i * xd->mi_stride - 1]; const_motion &= (ref_mv.as_int == ref_mi->mv[0].as_int) && @@ -3627,7 +3744,6 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (this_mode == NEARMV || this_mode == ZEROMV) continue; } - comp_pred = second_ref_frame > INTRA_FRAME; if (comp_pred) { if (!cpi->allow_comp_inter_inter) continue; @@ -3707,19 +3823,30 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, if (ref_frame == 
INTRA_FRAME) { TX_SIZE uv_tx; struct macroblockd_plane *const pd = &xd->plane[1]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif memset(x->skip_txfm, 0, sizeof(x->skip_txfm)); super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL, bsize, best_rd, recon); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif if (rate_y == INT_MAX) continue; uv_tx = uv_txsize_lookup[bsize][mi->tx_size][pd->subsampling_x] [pd->subsampling_y]; +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, intra_mode_search_time); +#endif if (rate_uv_intra[uv_tx] == INT_MAX) { choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx, &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx], &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]); } - +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, intra_mode_search_time); +#endif rate_uv = rate_uv_tokenonly[uv_tx]; distortion_uv = dist_uv[uv_tx]; skippable = skippable && skip_uv[uv_tx]; @@ -3730,11 +3857,18 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, rate2 += intra_cost_penalty; distortion2 = distortion_y + distortion_uv; } else { +#if CONFIG_COLLECT_COMPONENT_TIMING + start_timing(cpi, handle_inter_mode_time); +#endif this_rd = handle_inter_mode( cpi, x, bsize, &rate2, &distortion2, &skippable, &rate_y, &rate_uv, recon, &disable_skip, frame_mv, mi_row, mi_col, single_newmv, - single_inter_filter, single_skippable, &total_sse, best_rd, - &mask_filter, filter_cache); + single_inter_filter, single_skippable, + &single_mode_rate[ref_frame][0], &total_sse, best_rd, &mask_filter, + filter_cache, best_mode_index); +#if CONFIG_COLLECT_COMPONENT_TIMING + end_timing(cpi, handle_inter_mode_time); +#endif if (this_rd == INT64_MAX) continue; compmode_cost = vp9_cost_bit(comp_mode_p, comp_pred); @@ -3970,13 +4104,9 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data, } if (best_mode_index < 0 || best_rd >= best_rd_so_far) { -// If adaptive interp filter is enabled, then the current leaf node of 8x8 -// data is needed for sub8x8. Hence preserve the context. -#if CONFIG_CONSISTENT_RECODE + // If adaptive interp filter is enabled, then the current leaf node of 8x8 + // data is needed for sub8x8. Hence preserve the context. 
if (bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#else - if (cpi->row_mt && bsize == BLOCK_8X8) ctx->mic = *xd->mi[0]; -#endif rd_cost->rate = INT_MAX; rd_cost->rdcost = INT64_MAX; return; @@ -4091,7 +4221,7 @@ void vp9_rd_pick_inter_mode_sb_seg_skip(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mode = ZEROMV; mi->uv_mode = DC_PRED; mi->ref_frame[0] = LAST_FRAME; - mi->ref_frame[1] = NONE; + mi->ref_frame[1] = NO_REF_FRAME; mi->mv[0].as_int = 0; x->skip = 1; @@ -4236,7 +4366,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, int rate2 = 0, rate_y = 0, rate_uv = 0; int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable = 0; - int i; int this_skip2 = 0; int64_t total_sse = INT_MAX; int early_term = 0; @@ -4274,7 +4403,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, case ALTREF_FRAME: ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME); break; - case NONE: + case NO_REF_FRAME: case MAX_REF_FRAMES: assert(0 && "Invalid Reference frame"); break; } } @@ -4397,7 +4526,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, : NULL; if (scaled_ref_frame[ref]) { - int i; // Swap out the reference frame for a version that's been scaled to // match the resolution of the current frame, allowing the existing // motion search code to be used without additional modifications. @@ -4534,14 +4662,13 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, if (tmp_best_rdu > 0) { // If even the 'Y' rd value of split is higher than best so far - // then dont bother looking at UV + // then don't bother looking at UV vp9_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col, BLOCK_8X8); memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm)); if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable, &uv_sse, BLOCK_8X8, tmp_best_rdu)) { for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } @@ -4558,7 +4685,6 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, for (ref = 0; ref < 2; ++ref) { if (scaled_ref_frame[ref]) { // Restore the prediction frame pointers to their unscaled versions. - int i; for (i = 0; i < MAX_MB_PLANE; ++i) xd->plane[i].pre[ref] = backup_yv12[ref][i]; } @@ -4764,7 +4890,7 @@ void vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, TileDataEnc *tile_data, mi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int; } // If the second reference does not exist, set the corresponding mv to zero. - if (mi->ref_frame[1] == NONE) { + if (mi->ref_frame[1] == NO_REF_FRAME) { mi->mv[1].as_int = 0; for (i = 0; i < 4; ++i) { mi->bmi[i].as_mv[1].as_int = 0; diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c index 7486dee25..ca55ec988 100644 --- a/vp9/encoder/vp9_resize.c +++ b/vp9/encoder/vp9_resize.c @@ -360,6 +360,12 @@ static int get_down2_steps(int in_length, int out_length) { while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { ++steps; in_length = proj_in_length; + if (in_length == 1) { + // Special case: we break because any further calls to get_down2_length() + // will be with length == 1, which returns 1, resulting in an infinite + // loop. 
+ break; + } } return steps; } diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 0431d8a45..56fb5f94f 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -16,8 +16,11 @@ #include "vpx_dsp/vpx_dsp_common.h" // Mesh search patterns for various speed settings -static MESH_PATTERN best_quality_mesh_pattern[MAX_MESH_STEP] = { - { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } +// Define 2 mesh density levels for FC_GRAPHICS_ANIMATION content type and non +// FC_GRAPHICS_ANIMATION content type. +static MESH_PATTERN best_quality_mesh_pattern[2][MAX_MESH_STEP] = { + { { 64, 4 }, { 28, 2 }, { 15, 1 }, { 7, 1 } }, + { { 64, 8 }, { 28, 4 }, { 15, 1 }, { 7, 1 } }, }; #if !CONFIG_REALTIME_ONLY @@ -39,7 +42,7 @@ static int frame_is_boosted(const VP9_COMP *cpi) { // Sets a partition size down to which the auto partition code will always // search (can go lower), based on the image dimensions. The logic here // is that the extent to which ringing artefacts are offensive, depends -// partly on the screen area that over which they propogate. Propogation is +// partly on the screen area over which they propagate. Propagation is // limited by transform block size but the screen area taken up by a given block // size will be larger for a small image format stretched to full screen. static BLOCK_SIZE set_partition_min_limit(VP9_COMMON *const cm) { @@ -67,6 +70,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, const int is_720p_or_larger = min_frame_size >= 720; const int is_1080p_or_larger = min_frame_size >= 1080; const int is_2160p_or_larger = min_frame_size >= 2160; + const int boosted = frame_is_boosted(cpi); // speed 0 features sf->partition_search_breakout_thr.dist = (1 << 20); @@ -78,9 +82,13 @@ // Currently, the machine-learning based partition search early termination // is only used while VPXMIN(cm->width, cm->height) >= 480 and speed = 0. sf->rd_ml_partition.search_early_termination = 1; + sf->recode_tolerance_high = 45; } else { sf->use_square_only_thresh_high = BLOCK_32X32; } + if (is_720p_or_larger) { + sf->alt_ref_search_fp = 1; + } if (!is_1080p_or_larger) { sf->rd_ml_partition.search_breakout = 1; @@ -95,6 +103,13 @@ } } + if (!is_720p_or_larger) { + if (is_480p_or_larger) + sf->prune_single_mode_based_on_mv_diff_mode_rate = boosted ? 
0 : 1; + else + sf->prune_single_mode_based_on_mv_diff_mode_rate = 1; + } + if (speed >= 1) { sf->rd_ml_partition.search_early_termination = 0; sf->rd_ml_partition.search_breakout = 1; @@ -152,7 +167,7 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi, sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; sf->alt_ref_search_fp = 1; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->adaptive_interp_filter_search = 1; sf->disable_split_mask = DISABLE_ALL_SPLIT; } @@ -209,15 +224,32 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, const int boosted = frame_is_boosted(cpi); int i; - sf->tx_size_search_breakout = 1; + sf->adaptive_interp_filter_search = 1; + sf->adaptive_pred_interp_filter = 1; sf->adaptive_rd_thresh = 1; sf->adaptive_rd_thresh_row_mt = 0; sf->allow_skip_recode = 1; sf->less_rectangular_check = 1; - sf->use_square_partition_only = !boosted; + sf->mv.auto_mv_step_size = 1; + sf->mv.use_downsampled_sad = 1; sf->prune_ref_frame_for_rect_partitions = 1; - sf->rd_ml_partition.var_pruning = 1; + sf->temporal_filter_search_method = NSTEP; + sf->tx_size_search_breakout = 1; + sf->use_square_partition_only = !boosted; + sf->early_term_interp_search_plane_rd = 1; + sf->cb_pred_filter_search = 1; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = boosted ? 4.0 : 3.0; + + sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; + sf->comp_inter_joint_search_iter_level = 1; + + // Reference masking is not supported in dynamic scaling mode. + sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC; + sf->rd_ml_partition.var_pruning = 1; sf->rd_ml_partition.prune_rect_thresh[0] = -1; sf->rd_ml_partition.prune_rect_thresh[1] = 350; sf->rd_ml_partition.prune_rect_thresh[2] = 325; @@ -238,7 +270,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 1) { - sf->temporal_filter_search_method = NSTEP; sf->rd_ml_partition.var_pruning = !boosted; sf->rd_ml_partition.prune_rect_thresh[1] = 225; sf->rd_ml_partition.prune_rect_thresh[2] = 225; @@ -258,19 +289,18 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = tx_dom_thresholds[(speed < 6) ? speed : 5]; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = qopt_thresholds[(speed < 6) ? speed : 5]; + sf->trellis_opt_tx_rd.method = sf->optimize_coefficients + ? ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR + : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = qopt_thresholds[(speed < 6) ? speed : 5]; sf->less_rectangular_check = 1; sf->use_rd_breakout = 1; sf->adaptive_motion_search = 1; - sf->mv.auto_mv_step_size = 1; sf->adaptive_rd_thresh = 2; sf->mv.subpel_search_level = 1; if (cpi->oxcf.content != VP9E_CONTENT_FILM) sf->mode_skip_start = 10; - sf->adaptive_pred_interp_filter = 1; sf->allow_acl = 0; - sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; if (cpi->oxcf.content != VP9E_CONTENT_FILM) { sf->intra_y_mode_mask[TX_16X16] = INTRA_DC_H_V; @@ -296,18 +326,14 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL; - // Reference masking is not supported in dynamic scaling mode. - sf->reference_masking = oxcf->resize_mode != RESIZE_DYNAMIC ? 
1 : 0; - sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 : FLAG_SKIP_INTRA_DIRMISMATCH | FLAG_SKIP_INTRA_BESTINTER | FLAG_SKIP_COMP_BESTINTRA | FLAG_SKIP_INTRA_LOWVAR; sf->disable_filter_search_var_thresh = 100; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_iter_level = 2; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; - sf->recode_tolerance_low = 15; sf->recode_tolerance_high = 45; sf->enhanced_full_pixel_motion_search = 0; sf->prune_ref_frame_for_rect_partitions = 0; @@ -337,14 +363,13 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 1; sf->cb_partition_search = !boosted; - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->alt_ref_search_fp = 1; sf->recode_loop = ALLOW_RECODE_KFMAXBW; sf->adaptive_rd_thresh = 3; sf->mode_skip_start = 6; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC; - sf->adaptive_interp_filter_search = 1; if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { for (i = 0; i < MAX_MESH_STEP; ++i) { @@ -373,7 +398,6 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi, } if (speed >= 5) { - int i; sf->optimize_coefficients = 0; sf->mv.search_method = HEX; sf->disable_filter_search_var_thresh = 500; @@ -461,8 +485,8 @@ static void set_rt_speed_feature_framesize_independent( if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; sf->tx_domain_thresh = 0.0; - sf->allow_quant_coeff_opt = 0; - sf->quant_opt_thresh = 0.0; + sf->trellis_opt_tx_rd.method = DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 0.0; sf->use_square_partition_only = !frame_is_intra_only(cm); sf->less_rectangular_check = 1; sf->tx_size_search_method = @@ -507,7 +531,7 @@ static void set_rt_speed_feature_framesize_independent( } sf->disable_filter_search_var_thresh = 50; - sf->comp_inter_joint_search_thresh = BLOCK_SIZES; + sf->comp_inter_joint_search_iter_level = 2; sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX; sf->lf_motion_threshold = LOW_MOTION_THRESHOLD; sf->adjust_partitioning_from_last_frame = 1; @@ -631,7 +655,7 @@ static void set_rt_speed_feature_framesize_independent( sf->use_altref_onepass = 1; sf->use_compound_nonrd_pickmode = 1; } - if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 1280 * 720) sf->cb_pred_filter_search = 2; if (!cpi->external_resize) sf->use_source_sad = 1; } @@ -652,7 +676,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->content_state_sb_fd == NULL && (!cpi->use_svc || svc->spatial_layer_id == svc->number_spatial_layers - 1)) { - CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd, + CHECK_MEM_ERROR(&cm->error, cpi->content_state_sb_fd, (uint8_t *)vpx_calloc( (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t))); @@ -721,7 +745,7 @@ static void set_rt_speed_feature_framesize_independent( if (cpi->use_svc && svc->use_gf_temporal_ref_current_layer && svc->temporal_layer_id > 0) cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); - if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 640 * 480) sf->cb_pred_filter_search = 2; } if (speed >= 8) { @@ -765,7 +789,7 @@ static void set_rt_speed_feature_framesize_independent( } sf->limit_newmv_early_exit = 0; sf->use_simple_block_yrd = 1; - if (cm->width * cm->height > 352 * 288) sf->cb_pred_filter_search = 1; + if (cm->width * cm->height > 352 * 288) 
sf->cb_pred_filter_search = 2; } if (speed >= 9) { @@ -775,7 +799,7 @@ static void set_rt_speed_feature_framesize_independent( for (i = 0; i < BLOCK_SIZES; ++i) sf->intra_y_mode_bsize_mask[i] = INTRA_DC; } - sf->cb_pred_filter_search = 1; + sf->cb_pred_filter_search = 2; sf->mv.enable_adaptive_subpel_force_stop = 1; sf->mv.adapt_subpel_force_stop.mv_thresh = 1; sf->mv.adapt_subpel_force_stop.force_stop_below = QUARTER_PEL; @@ -808,13 +832,13 @@ static void set_rt_speed_feature_framesize_independent( } if (cpi->count_arf_frame_usage == NULL) { CHECK_MEM_ERROR( - cm, cpi->count_arf_frame_usage, + &cm->error, cpi->count_arf_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_arf_frame_usage))); } if (cpi->count_lastgolden_frame_usage == NULL) CHECK_MEM_ERROR( - cm, cpi->count_lastgolden_frame_usage, + &cm->error, cpi->count_lastgolden_frame_usage, (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(*cpi->count_lastgolden_frame_usage))); } @@ -835,6 +859,11 @@ static void set_rt_speed_feature_framesize_independent( // off for now. if (speed <= 3 && cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) cpi->oxcf.aq_mode = 0; + // For all speeds for rt mode: if the deadline mode changed (was good/best + // quality on previous frame and now is realtime) set nonrd_keyframe to 1 to + // avoid entering rd pickmode. This causes issues, such as: b/310663186. + if (cpi->oxcf.mode != cpi->deadline_mode_previous_frame) + sf->nonrd_keyframe = 1; } void vp9_set_speed_features_framesize_dependent(VP9_COMP *cpi, int speed) { @@ -904,14 +933,17 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->coeff_prob_appx_step = 1; sf->mv.auto_mv_step_size = 0; sf->mv.fullpel_search_step_param = 6; - sf->comp_inter_joint_search_thresh = BLOCK_4X4; + sf->mv.use_downsampled_sad = 0; + sf->comp_inter_joint_search_iter_level = 0; sf->tx_size_search_method = USE_FULL_RD; sf->use_lp32x32fdct = 0; sf->adaptive_motion_search = 0; sf->enhanced_full_pixel_motion_search = 1; sf->adaptive_pred_interp_filter = 0; sf->adaptive_mode_search = 0; + sf->prune_single_mode_based_on_mv_diff_mode_rate = 0; sf->cb_pred_filter_search = 0; + sf->early_term_interp_search_plane_rd = 0; sf->cb_partition_search = 0; sf->motion_field_mode_search = 0; sf->alt_ref_search_fp = 0; @@ -936,8 +968,9 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->adaptive_interp_filter_search = 0; sf->allow_txfm_domain_distortion = 0; sf->tx_domain_thresh = 99.0; - sf->allow_quant_coeff_opt = sf->optimize_coefficients; - sf->quant_opt_thresh = 99.0; + sf->trellis_opt_tx_rd.method = + sf->optimize_coefficients ? ENABLE_TRELLIS_OPT : DISABLE_TRELLIS_OPT; + sf->trellis_opt_tx_rd.thresh = 99.0; sf->allow_acl = 1; sf->enable_tpl_model = oxcf->enable_tpl_model; sf->prune_ref_frame_for_rect_partitions = 0; @@ -991,10 +1024,14 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi, int speed) { sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) : INT_MAX; - if (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) { + { + const int mesh_density_level = + (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? 
0 : 1; for (i = 0; i < MAX_MESH_STEP; ++i) { - sf->mesh_patterns[i].range = best_quality_mesh_pattern[i].range; - sf->mesh_patterns[i].interval = best_quality_mesh_pattern[i].interval; + sf->mesh_patterns[i].range = + best_quality_mesh_pattern[mesh_density_level][i].range; + sf->mesh_patterns[i].interval = + best_quality_mesh_pattern[mesh_density_level][i].interval; } } diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index c2ae970b7..941de639a 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -210,6 +210,10 @@ typedef struct MV_SPEED_FEATURES { // This variable sets the step_param used in full pel motion search. int fullpel_search_step_param; + + // Whether to downsample the rows in sad calculation during motion search. + // This is only active when there are at least 8 rows. + int use_downsampled_sad; } MV_SPEED_FEATURES; typedef struct PARTITION_SEARCH_BREAKOUT_THR { @@ -246,6 +250,24 @@ typedef enum { USE_8_TAPS_SHARP, } SUBPEL_SEARCH_TYPE; +typedef enum { + // Disable trellis coefficient optimization + DISABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization + ENABLE_TRELLIS_OPT, + // Enable trellis coefficient optimization based on source variance of the + // prediction block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_SRC_VAR, + // Enable trellis coefficient optimization based on residual mse of the + // transform block during transform RD + ENABLE_TRELLIS_OPT_TX_RD_RESIDUAL_MSE, +} ENABLE_TRELLIS_OPT_METHOD; + +typedef struct TRELLIS_OPT_CONTROL { + ENABLE_TRELLIS_OPT_METHOD method; + double thresh; +} TRELLIS_OPT_CONTROL; + typedef struct SPEED_FEATURES { MV_SPEED_FEATURES mv; @@ -264,11 +286,20 @@ typedef struct SPEED_FEATURES { // adds overhead. int static_segmentation; - // If 1 we iterate finding a best reference for 2 ref frames together - via - // a log search that iterates 4 times (check around mv for last for best - // error of combined predictor then check around mv for alt). If 0 we - // we just use the best motion vector found for each frame by itself. - BLOCK_SIZE comp_inter_joint_search_thresh; + // The best compound predictor is found using an iterative log search process + // that searches for best ref0 mv using error of combined predictor and then + // searches for best ref1 mv. This sf determines the number of iterations of + // this process based on block size. The sf becomes more aggressive from level + // 0 to 2. The following table indicates the number of iterations w.r.t bsize: + // ----------------------------------------------- + // |sf (level)|bsize < 8X8| [8X8, 16X16] | > 16X16 | + // | 0 | 4 | 4 | 4 | + // | 1 | 0 | 2 | 4 | + // | 2 | 0 | 0 | 0 | + // ----------------------------------------------- + // Here, 0 iterations indicate using the best single motion vector selected + // for each ref frame without any iterative refinement. + int comp_inter_joint_search_iter_level; // This variable is used to cap the maximum number of times we skip testing a // mode to be evaluated. A high value means we will be faster. @@ -292,8 +323,8 @@ typedef struct SPEED_FEATURES { int coeff_prob_appx_step; // Enable uniform quantizer followed by trellis coefficient optimization - int allow_quant_coeff_opt; - double quant_opt_thresh; + // during transform RD + TRELLIS_OPT_CONTROL trellis_opt_tx_rd; // Enable asymptotic closed-loop encoding decision for key frame and // alternate reference frames. 
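
The iteration table above fully specifies the new helper's behavior, so a sketch is easy to give. A minimal implementation of get_joint_search_iters() consistent with that table, assuming the three bsize buckets map directly onto VP9's BLOCK_SIZE enum (the actual helper in vp9_rdopt.c may differ in detail):

static int get_joint_search_iters(int sf_level, BLOCK_SIZE bsize) {
  // Level 0: always run the full 4 refinement iterations.
  if (sf_level == 0) return 4;
  // Level 2: skip iterative refinement entirely.
  if (sf_level >= 2) return 0;
  // Level 1: spend iterations only on blocks of 8x8 and above.
  if (bsize < BLOCK_8X8) return 0;
  if (bsize <= BLOCK_16X16) return 2;
  return 4;
}

Returning 0 makes the callers shown earlier fall through to the vp9_mv_bit_cost() path, i.e. the best single-prediction motion vector of each reference frame is used without joint refinement.
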
@@ -399,9 +430,21 @@ typedef struct SPEED_FEATURES { // Adaptive prediction mode search int adaptive_mode_search; - // Chessboard pattern prediction filter type search + // Prune NEAREST and ZEROMV single reference modes based on motion vector + // difference and mode rate + int prune_single_mode_based_on_mv_diff_mode_rate; + + // Chessboard pattern prediction for interp filter. Aggressiveness increases + // with levels. + // 0: disable + // 1: cb pattern in eval when filter is not switchable + // 2: cb pattern prediction for filter search int cb_pred_filter_search; + // This variable enables an early termination of interpolation filter eval + // based on the current rd cost after processing each plane + int early_term_interp_search_plane_rd; + int cb_partition_search; int motion_field_mode_search; @@ -600,7 +643,7 @@ typedef struct SPEED_FEATURES { // Use machine learning based partition search. int nonrd_use_ml_partition; - // Multiplier for base thresold for variance partitioning. + // Multiplier for base threshold for variance partitioning. int variance_part_thresh_mult; // Force subpel motion filter to always use SMOOTH_FILTER. diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 7e9435fb5..fff6d25de 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -107,7 +107,6 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { int layer = LAYER_IDS_TO_IDX(sl, tl, oxcf->ts_number_layers); LAYER_CONTEXT *const lc = &svc->layer_context[layer]; RATE_CONTROL *const lrc = &lc->rc; - int i; lc->current_video_frame_in_layer = 0; lc->layer_size = 0; lc->frames_from_key_frame = 0; @@ -164,17 +163,17 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { lc->actual_num_seg1_blocks = 0; lc->actual_num_seg2_blocks = 0; lc->counter_encode_maxq_scene_change = 0; - CHECK_MEM_ERROR(cm, lc->map, + CHECK_MEM_ERROR(&cm->error, lc->map, vpx_malloc(mi_rows * mi_cols * sizeof(*lc->map))); memset(lc->map, 0, mi_rows * mi_cols); last_coded_q_map_size = mi_rows * mi_cols * sizeof(*lc->last_coded_q_map); - CHECK_MEM_ERROR(cm, lc->last_coded_q_map, + CHECK_MEM_ERROR(&cm->error, lc->last_coded_q_map, vpx_malloc(last_coded_q_map_size)); assert(MAXQ <= 255); memset(lc->last_coded_q_map, MAXQ, last_coded_q_map_size); consec_zero_mv_size = mi_rows * mi_cols * sizeof(*lc->consec_zero_mv); - CHECK_MEM_ERROR(cm, lc->consec_zero_mv, + CHECK_MEM_ERROR(&cm->error, lc->consec_zero_mv, vpx_malloc(consec_zero_mv_size)); memset(lc->consec_zero_mv, 0, consec_zero_mv_size); } @@ -220,18 +219,21 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, RATE_CONTROL *const lrc = &lc->rc; lc->spatial_layer_target_bandwidth = spatial_layer_target; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } lrc->starting_buffer_level = - (int64_t)(rc->starting_buffer_level * bitrate_alloc); + (int64_t)(rc->starting_buffer_level * bitrate_alloc + 0.5); lrc->optimal_buffer_level = - (int64_t)(rc->optimal_buffer_level * bitrate_alloc); + (int64_t)(rc->optimal_buffer_level * bitrate_alloc + 0.5); lrc->maximum_buffer_size = - (int64_t)(rc->maximum_buffer_size * bitrate_alloc); + (int64_t)(rc->maximum_buffer_size * bitrate_alloc + 0.5); lrc->bits_off_target = VPXMIN(lrc->bits_off_target, lrc->maximum_buffer_size); lrc->buffer_level = VPXMIN(lrc->buffer_level, lrc->maximum_buffer_size); lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - 
lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; lrc->worst_quality = rc->worst_quality; lrc->best_quality = rc->best_quality; @@ -252,7 +254,9 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, lc->target_bandwidth = oxcf->layer_target_bitrate[layer]; - bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + if (target_bandwidth != 0) { + bitrate_alloc = (float)lc->target_bandwidth / target_bandwidth; + } // Update buffer-related quantities. lrc->starting_buffer_level = (int64_t)(rc->starting_buffer_level * bitrate_alloc); @@ -269,7 +273,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi, } else { lc->framerate = cpi->framerate; } - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = rc->max_frame_bandwidth; // Update qp-related quantities. lrc->worst_quality = rc->worst_quality; @@ -311,7 +316,8 @@ void vp9_update_temporal_layer_framerate(VP9_COMP *const cpi) { const int tl = svc->temporal_layer_id; lc->framerate = cpi->framerate / oxcf->ts_rate_decimator[tl]; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->max_frame_bandwidth = cpi->rc.max_frame_bandwidth; // Update the average layer frame size (non-cumulative per-frame-bw). if (tl == 0) { @@ -333,7 +339,8 @@ void vp9_update_spatial_layer_framerate(VP9_COMP *const cpi, double framerate) { RATE_CONTROL *const lrc = &lc->rc; lc->framerate = framerate; - lrc->avg_frame_bandwidth = (int)(lc->target_bandwidth / lc->framerate); + lrc->avg_frame_bandwidth = + (int)VPXMIN(lc->target_bandwidth / lc->framerate, INT_MAX); lrc->min_frame_bandwidth = (int)(lrc->avg_frame_bandwidth * oxcf->two_pass_vbrmin_section / 100); lrc->max_frame_bandwidth = (int)(((int64_t)lrc->avg_frame_bandwidth * @@ -389,6 +396,8 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->twopass = cpi->twopass; lc->target_bandwidth = (int)oxcf->target_bandwidth; lc->alt_ref_source = cpi->alt_ref_source; + lc->frame_qp = cpi->common.base_qindex; + lc->MBs = cpi->common.MBs; // For spatial-svc, allow cyclic-refresh to be applied on the spatial layers, // for the base temporal layer. @@ -408,6 +417,9 @@ void vp9_save_layer_context(VP9_COMP *const cpi) { lc->actual_num_seg1_blocks = cr->actual_num_seg1_blocks; lc->actual_num_seg2_blocks = cr->actual_num_seg2_blocks; lc->counter_encode_maxq_scene_change = cr->counter_encode_maxq_scene_change; + lc->qindex_delta[0] = cr->qindex_delta[0]; + lc->qindex_delta[1] = cr->qindex_delta[1]; + lc->qindex_delta[2] = cr->qindex_delta[2]; } } @@ -790,9 +802,9 @@ int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) { for (sl = svc->number_spatial_layers - 1; sl >= svc->first_spatial_layer_to_encode; sl--) { int layer = sl * svc->number_temporal_layers + svc->temporal_layer_id; - LAYER_CONTEXT *const lc = &svc->layer_context[layer]; - cpi->rc = lc->rc; - cpi->oxcf.target_bandwidth = lc->target_bandwidth; + LAYER_CONTEXT *const sl_lc = &svc->layer_context[layer]; + cpi->rc = sl_lc->rc; + cpi->oxcf.target_bandwidth = sl_lc->target_bandwidth; if (vp9_test_drop(cpi)) { int sl2; // Set flag to force drop in encoding for this mode. 
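
The avg_frame_bandwidth assignments above all route the double-valued quotient through VPXMIN() before narrowing to int, since converting a double larger than INT_MAX to int is undefined behavior in C. A standalone sketch of the pattern, with a hypothetical helper name and example values (VPXMIN itself is the min macro from vpx_dsp/vpx_dsp_common.h):

#include <limits.h>

#define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))

static int layer_avg_frame_bandwidth(int target_bandwidth, double framerate) {
  // A small framerate, e.g. 1e-4 against a 2 Mbps target, yields 2e10 bits
  // per frame; clamping to INT_MAX first keeps the cast to int well defined.
  const double bits_per_frame = target_bandwidth / framerate;
  return (int)VPXMIN(bits_per_frame, INT_MAX);
}
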
@@ -1041,17 +1053,17 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { int sl, tl; for (sl = 0; sl < svc->number_spatial_layers; ++sl) { // Check for reset based on avg_frame_bandwidth for spatial layer sl. - int layer = LAYER_IDS_TO_IDX(sl, svc->number_temporal_layers - 1, - svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; + const int spatial_layer_idx = LAYER_IDS_TO_IDX( + sl, svc->number_temporal_layers - 1, svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[spatial_layer_idx]; RATE_CONTROL *lrc = &lc->rc; if (lrc->avg_frame_bandwidth > (3 * lrc->last_avg_frame_bandwidth >> 1) || lrc->avg_frame_bandwidth < (lrc->last_avg_frame_bandwidth >> 1)) { // Reset for all temporal layers with spatial layer sl. for (tl = 0; tl < svc->number_temporal_layers; ++tl) { - int layer = LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); - LAYER_CONTEXT *lc = &svc->layer_context[layer]; - RATE_CONTROL *lrc = &lc->rc; + int temporal_layer_idx = + LAYER_IDS_TO_IDX(sl, tl, svc->number_temporal_layers); + lrc = &svc->layer_context[temporal_layer_idx].rc; lrc->rc_1_frame = 0; lrc->rc_2_frame = 0; lrc->bits_off_target = lrc->optimal_buffer_level; @@ -1137,7 +1149,7 @@ void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { void vp9_svc_assert_constraints_pattern(VP9_COMP *const cpi) { SVC *const svc = &cpi->svc; // For fixed/non-flexible mode, the following constraints are expected, - // when inter-layer prediciton is on (default). + // when inter-layer prediction is on (default). if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && svc->disable_inter_layer_pred == INTER_LAYER_PRED_ON && svc->framedrop_mode != LAYER_DROP) { @@ -1338,3 +1350,27 @@ void vp9_svc_adjust_avg_frame_qindex(VP9_COMP *const cpi) { } } } + +// SVC: skip encoding of enhancement layer if the layer target bandwidth = 0. +// No need to set svc.skip_enhancement_layer if whole superframe will be +// dropped. 
+int vp9_svc_check_skip_enhancement_layer(VP9_COMP *const cpi) { + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0 && + cpi->oxcf.target_bandwidth == 0 && + !(cpi->svc.framedrop_mode != LAYER_DROP && + (cpi->svc.framedrop_mode != CONSTRAINED_FROM_ABOVE_DROP || + cpi->svc + .force_drop_constrained_from_above[cpi->svc.number_spatial_layers - + 1]) && + cpi->svc.drop_spatial_layer[0])) { + cpi->svc.skip_enhancement_layer = 1; + vp9_rc_postencode_update_drop_frame(cpi); + cpi->ext_refresh_frame_flags_pending = 0; + cpi->last_frame_dropped = 1; + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 1; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = 1; + vp9_inc_frame_in_layer(cpi); + return 1; + } + return 0; +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index c7328cf57..388a02789 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -70,8 +70,11 @@ typedef struct { int actual_num_seg1_blocks; int actual_num_seg2_blocks; int counter_encode_maxq_scene_change; + int qindex_delta[3]; uint8_t speed; int loopfilter_ctrl; + int frame_qp; + int MBs; } LAYER_CONTEXT; typedef struct SVC { @@ -278,6 +281,8 @@ void vp9_svc_update_ref_frame(struct VP9_COMP *const cpi); void vp9_svc_adjust_frame_rate(struct VP9_COMP *const cpi); void vp9_svc_adjust_avg_frame_qindex(struct VP9_COMP *const cpi); + +int vp9_svc_check_skip_enhancement_layer(struct VP9_COMP *const cpi); #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 8af30c42a..986553a4a 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -450,8 +450,6 @@ void vp9_highbd_apply_temporal_filter_c( // Apply the filter to luma for (row = 0; row < (int)block_height; row++) { for (col = 0; col < (int)block_width; col++) { - const int uv_row = row >> ss_y; - const int uv_col = col >> ss_x; const int filter_weight = get_filter_weight( row, col, block_height, block_width, blk_fw, use_32x32); @@ -476,6 +474,8 @@ void vp9_highbd_apply_temporal_filter_c( // Sum the corresponding uv pixels to the current y modifier // Note we are rounding down instead of rounding to the nearest pixel. + uv_row = row >> ss_y; + uv_col = col >> ss_x; y_mod += u_diff_sse[uv_row * uv_diff_stride + uv_col]; y_mod += v_diff_sse[uv_row * uv_diff_stride + uv_col]; diff --git a/vp9/encoder/x86/temporal_filter_constants.h b/vp9/encoder/vp9_temporal_filter_constants.h index 7dcedda19..8776dfc06 100644 --- a/vp9/encoder/x86/temporal_filter_constants.h +++ b/vp9/encoder/vp9_temporal_filter_constants.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 The WebM project authors. All Rights Reserved. + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. * * Use of this source code is governed by a BSD-style license * that can be found in the LICENSE file in the root of the source @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#ifndef VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ -#define VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#ifndef VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ +#define VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ #include "./vpx_config.h" // Division using multiplication and shifting. 
The C implementation does: @@ -407,4 +407,4 @@ static const uint32_t #define DIST_STRIDE ((BW) + 2) -#endif // VPX_VP9_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_ +#endif // VPX_VP9_ENCODER_TEMPORAL_FILTER_CONSTANTS_H_ diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 814d769be..6c6c04493 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -364,7 +364,7 @@ static void tokenize_b(int plane, int block, int row, int col, const PLANE_TYPE type = get_plane_type(plane); const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block); const int16_t *scan, *nb; - const scan_order *so; + const ScanOrder *so; const int ref = is_inter_block(mi); unsigned int(*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] = td->rd_counts.coef_counts[tx_size][type][ref]; diff --git a/vp9/encoder/vp9_tpl_model.c b/vp9/encoder/vp9_tpl_model.c new file mode 100644 index 000000000..b8910370e --- /dev/null +++ b/vp9/encoder/vp9_tpl_model.c @@ -0,0 +1,1541 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <math.h> + +#include "./vpx_dsp_rtcd.h" +#if CONFIG_NON_GREEDY_MV +#include "vp9/common/vp9_mvref_common.h" +#endif +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_reconintra.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_tpl_model.h" +#include "vpx/internal/vpx_codec_internal.h" +#include "vpx/vpx_codec.h" + +static int init_gop_frames(VP9_COMP *cpi, GF_PICTURE *gf_picture, + const GF_GROUP *gf_group, int *tpl_group_frames) { + VP9_COMMON *cm = &cpi->common; + int frame_idx = 0; + int i; + int gld_index = -1; + int alt_index = -1; + int lst_index = -1; + int arf_index_stack[MAX_ARF_LAYERS]; + int arf_stack_size = 0; + int extend_frame_count = 0; + int pframe_qindex = cpi->tpl_stats[2].base_qindex; + int frame_gop_offset = 0; + + RefCntBuffer *frame_bufs = cm->buffer_pool->frame_bufs; + int8_t recon_frame_index[REFS_PER_FRAME + MAX_ARF_LAYERS]; + + memset(recon_frame_index, -1, sizeof(recon_frame_index)); + stack_init(arf_index_stack, MAX_ARF_LAYERS); + + for (i = 0; i < FRAME_BUFFERS; ++i) { + if (frame_bufs[i].ref_count == 0) { + alloc_frame_mvs(cm, i); + if (vpx_realloc_frame_buffer(&frame_bufs[i].buf, cm->width, cm->height, + cm->subsampling_x, cm->subsampling_y, +#if CONFIG_VP9_HIGHBITDEPTH + cm->use_highbitdepth, +#endif + VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment, + NULL, NULL, NULL)) + vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, + "Failed to allocate frame buffer"); + + recon_frame_index[frame_idx] = i; + ++frame_idx; + + if (frame_idx >= REFS_PER_FRAME + cpi->oxcf.enable_auto_arf) break; + } + } + + for (i = 0; i < REFS_PER_FRAME + 1; ++i) { + assert(recon_frame_index[i] >= 0); + cpi->tpl_recon_frames[i] = &frame_bufs[recon_frame_index[i]].buf; + } + + *tpl_group_frames = 0; + + // Initialize Golden reference frame. 
+ gf_picture[0].frame = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + for (i = 0; i < 3; ++i) gf_picture[0].ref_frame[i] = -1; + gf_picture[0].update_type = gf_group->update_type[0]; + gld_index = 0; + ++*tpl_group_frames; + + // Initialize base layer ARF frame + gf_picture[1].frame = cpi->Source; + gf_picture[1].ref_frame[0] = gld_index; + gf_picture[1].ref_frame[1] = lst_index; + gf_picture[1].ref_frame[2] = alt_index; + gf_picture[1].update_type = gf_group->update_type[1]; + alt_index = 1; + ++*tpl_group_frames; + + // Initialize P frames + for (frame_idx = 2; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + struct lookahead_entry *buf; + frame_gop_offset = gf_group->frame_gop_index[frame_idx]; + buf = vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = gf_group->update_type[frame_idx]; + + switch (gf_group->update_type[frame_idx]) { + case ARF_UPDATE: + stack_push(arf_index_stack, alt_index, arf_stack_size); + ++arf_stack_size; + alt_index = frame_idx; + break; + case LF_UPDATE: lst_index = frame_idx; break; + case OVERLAY_UPDATE: + gld_index = frame_idx; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + case USE_BUF_FRAME: + lst_index = alt_index; + alt_index = stack_pop(arf_index_stack, arf_stack_size); + --arf_stack_size; + break; + default: break; + } + + ++*tpl_group_frames; + + // The length of group of pictures is baseline_gf_interval, plus the + // beginning golden frame from last GOP, plus the last overlay frame in + // the same GOP. + if (frame_idx == gf_group->gf_group_size) break; + } + + alt_index = -1; + ++frame_idx; + ++frame_gop_offset; + + // Extend two frames outside the current gf group. 
+ for (; frame_idx < MAX_LAG_BUFFERS && extend_frame_count < 2; ++frame_idx) { + struct lookahead_entry *buf = + vp9_lookahead_peek(cpi->lookahead, frame_gop_offset - 1); + + if (buf == NULL) break; + + cpi->tpl_stats[frame_idx].base_qindex = pframe_qindex; + + gf_picture[frame_idx].frame = &buf->img; + gf_picture[frame_idx].ref_frame[0] = gld_index; + gf_picture[frame_idx].ref_frame[1] = lst_index; + gf_picture[frame_idx].ref_frame[2] = alt_index; + gf_picture[frame_idx].update_type = LF_UPDATE; + lst_index = frame_idx; + ++*tpl_group_frames; + ++extend_frame_count; + ++frame_gop_offset; + } + + return extend_frame_count; +} + +static void init_tpl_stats(VP9_COMP *cpi) { + int frame_idx; + for (frame_idx = 0; frame_idx < MAX_ARF_GOP_SIZE; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + memset(tpl_frame->tpl_stats_ptr, 0, + tpl_frame->height * tpl_frame->width * + sizeof(*tpl_frame->tpl_stats_ptr)); + tpl_frame->is_valid = 0; + } +} + +static void free_tpl_frame_stats_list(VpxTplGopStats *tpl_gop_stats) { + int frame_idx; + for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) { + vpx_free(tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} + +static void init_tpl_stats_before_propagation( + struct vpx_internal_error_info *error_info, VpxTplGopStats *tpl_gop_stats, + TplDepFrame *tpl_stats, int tpl_gop_frames, int frame_width, + int frame_height) { + int frame_idx; + free_tpl_frame_stats_list(tpl_gop_stats); + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list, + vpx_calloc(tpl_gop_frames, sizeof(*tpl_gop_stats->frame_stats_list))); + tpl_gop_stats->size = tpl_gop_frames; + for (frame_idx = 0; frame_idx < tpl_gop_frames; ++frame_idx) { + const int mi_rows = tpl_stats[frame_idx].height; + const int mi_cols = tpl_stats[frame_idx].width; + CHECK_MEM_ERROR( + error_info, tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list, + vpx_calloc( + mi_rows * mi_cols, + sizeof( + *tpl_gop_stats->frame_stats_list[frame_idx].block_stats_list))); + tpl_gop_stats->frame_stats_list[frame_idx].num_blocks = mi_rows * mi_cols; + tpl_gop_stats->frame_stats_list[frame_idx].frame_width = frame_width; + tpl_gop_stats->frame_stats_list[frame_idx].frame_height = frame_height; + } +} + +#if CONFIG_NON_GREEDY_MV +static uint32_t full_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, + int frame_idx, uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, int mi_row, + int mi_col, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + int step_param; + uint32_t bestsme = UINT_MAX; + const MvLimits tmp_mv_limits = x->mv_limits; + // lambda is used to adjust the importance of motion vector consistency. + // TODO(angiebird): Figure out lambda's proper value. 
+ const int lambda = cpi->tpl_stats[frame_idx].lambda; + int_mv nb_full_mvs[NB_MVS_NUM]; + int nb_full_mv_num; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + nb_full_mv_num = + vp9_prepare_nb_full_mvs(motion_field, mi_row, mi_col, nb_full_mvs); + vp9_full_pixel_diamond_new(cpi, x, bsize, &best_ref_mv1_full, step_param, + lambda, 1, nb_full_mvs, nb_full_mv_num, mv); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + return bestsme; +} + +static uint32_t sub_pixel_motion_search(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, int stride, + BLOCK_SIZE bsize, MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + + MV best_ref_mv1 = { 0, 0 }; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. + // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} + +#else // CONFIG_NON_GREEDY_MV +static uint32_t motion_compensated_prediction(VP9_COMP *cpi, ThreadData *td, + uint8_t *cur_frame_buf, + uint8_t *ref_frame_buf, + int stride, BLOCK_SIZE bsize, + MV *mv) { + MACROBLOCK *const x = &td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; + const SEARCH_METHODS search_method = NSTEP; + int step_param; + int sadpb = x->sadperbit16; + uint32_t bestsme = UINT_MAX; + uint32_t distortion; + uint32_t sse; + int cost_list[5]; + const MvLimits tmp_mv_limits = x->mv_limits; + + MV best_ref_mv1 = { 0, 0 }; + MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */ + + best_ref_mv1_full.col = best_ref_mv1.col >> 3; + best_ref_mv1_full.row = best_ref_mv1.row >> 3; + + // Setup frame pointers + x->plane[0].src.buf = cur_frame_buf; + x->plane[0].src.stride = stride; + xd->plane[0].pre[0].buf = ref_frame_buf; + xd->plane[0].pre[0].stride = stride; + + step_param = mv_sf->reduce_first_step_size; + step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2); + + vp9_set_mv_search_range(&x->mv_limits, &best_ref_mv1); + + vp9_full_pixel_search(cpi, x, bsize, &best_ref_mv1_full, step_param, + search_method, sadpb, cond_cost_list(cpi, cost_list), + &best_ref_mv1, mv, 0, 0); + + /* restore UMV window */ + x->mv_limits = tmp_mv_limits; + + // TODO(yunqing): may use higher tap interp filter than 2 taps. 
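Both search variants make the same two simplifications, presumably because the TPL pass only needs approximate distortions rather than an rd-exact search: the fractional step runs with the 2-tap (bilinear) filter, and the cost arrays are passed as NULL so mv signaling cost is ignored entirely.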
+ // Ignore mv costing by sending NULL pointer instead of cost array + bestsme = cpi->find_fractional_mv_step( + x, mv, &best_ref_mv1, cpi->common.allow_high_precision_mv, x->errorperbit, + &cpi->fn_ptr[bsize], 0, mv_sf->subpel_search_level, + cond_cost_list(cpi, cost_list), NULL, NULL, &distortion, &sse, NULL, 0, 0, + USE_2_TAPS); + + return bestsme; +} +#endif + +static int get_overlap_area(int grid_pos_row, int grid_pos_col, int ref_pos_row, + int ref_pos_col, int block, BLOCK_SIZE bsize) { + int width = 0, height = 0; + int bw = 4 << b_width_log2_lookup[bsize]; + int bh = 4 << b_height_log2_lookup[bsize]; + + switch (block) { + case 0: + width = grid_pos_col + bw - ref_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 1: + width = ref_pos_col + bw - grid_pos_col; + height = grid_pos_row + bh - ref_pos_row; + break; + case 2: + width = grid_pos_col + bw - ref_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + case 3: + width = ref_pos_col + bw - grid_pos_col; + height = ref_pos_row + bh - grid_pos_row; + break; + default: assert(0); + } + + return width * height; +} + +static int round_floor(int ref_pos, int bsize_pix) { + int round; + if (ref_pos < 0) + round = -(1 + (-ref_pos - 1) / bsize_pix); + else + round = ref_pos / bsize_pix; + + return round; +} + +static void tpl_model_store(TplDepStats *tpl_stats, int mi_row, int mi_col, + BLOCK_SIZE bsize, int stride) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = &tpl_stats[(mi_row + idy) * stride + mi_col + idx]; + const int64_t mc_flow = tpl_ptr->mc_flow; + const int64_t mc_ref_cost = tpl_ptr->mc_ref_cost; + *tpl_ptr = *src_stats; + tpl_ptr->mc_flow = mc_flow; + tpl_ptr->mc_ref_cost = mc_ref_cost; + tpl_ptr->mc_dep_cost = tpl_ptr->intra_cost + tpl_ptr->mc_flow; + } + } +} + +static void tpl_store_before_propagation(VpxTplBlockStats *tpl_block_stats, + TplDepStats *tpl_stats, int mi_row, + int mi_col, BLOCK_SIZE bsize, + int stride, int64_t recon_error, + int64_t rate_cost, int ref_frame_idx) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const TplDepStats *src_stats = &tpl_stats[mi_row * stride + mi_col]; + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + VpxTplBlockStats *tpl_block_stats_ptr = + &tpl_block_stats[(mi_row + idy) * stride + mi_col + idx]; + tpl_block_stats_ptr->row = mi_row * 8; + tpl_block_stats_ptr->col = mi_col * 8; + tpl_block_stats_ptr->inter_cost = src_stats->inter_cost; + tpl_block_stats_ptr->intra_cost = src_stats->intra_cost; + tpl_block_stats_ptr->recrf_dist = recon_error << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->recrf_rate = rate_cost << TPL_DEP_COST_SCALE_LOG2; + tpl_block_stats_ptr->mv_r = src_stats->mv.as_mv.row; + tpl_block_stats_ptr->mv_c = src_stats->mv.as_mv.col; + tpl_block_stats_ptr->ref_frame_index = ref_frame_idx; + } + } +} + +static void tpl_model_update_b(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + TplDepFrame *ref_tpl_frame = &tpl_frame[tpl_stats->ref_frame_index]; + TplDepStats *ref_stats = ref_tpl_frame->tpl_stats_ptr; + MV mv = tpl_stats->mv.as_mv; + int mv_row = mv.row >> 3; + int mv_col = mv.col >> 3; + + int 
ref_pos_row = mi_row * MI_SIZE + mv_row; + int ref_pos_col = mi_col * MI_SIZE + mv_col; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int pix_num = bw * bh; + + // top-left on grid block location in pixel + int grid_pos_row_base = round_floor(ref_pos_row, bh) * bh; + int grid_pos_col_base = round_floor(ref_pos_col, bw) * bw; + int block; + + for (block = 0; block < 4; ++block) { + int grid_pos_row = grid_pos_row_base + bh * (block >> 1); + int grid_pos_col = grid_pos_col_base + bw * (block & 0x01); + + if (grid_pos_row >= 0 && grid_pos_row < ref_tpl_frame->mi_rows * MI_SIZE && + grid_pos_col >= 0 && grid_pos_col < ref_tpl_frame->mi_cols * MI_SIZE) { + int overlap_area = get_overlap_area( + grid_pos_row, grid_pos_col, ref_pos_row, ref_pos_col, block, bsize); + int ref_mi_row = round_floor(grid_pos_row, bh) * mi_height; + int ref_mi_col = round_floor(grid_pos_col, bw) * mi_width; + + int64_t mc_flow = tpl_stats->mc_dep_cost - + (tpl_stats->mc_dep_cost * tpl_stats->inter_cost) / + tpl_stats->intra_cost; + + int idx, idy; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *des_stats = + &ref_stats[(ref_mi_row + idy) * ref_tpl_frame->stride + + (ref_mi_col + idx)]; + + des_stats->mc_flow += (mc_flow * overlap_area) / pix_num; + des_stats->mc_ref_cost += + ((tpl_stats->intra_cost - tpl_stats->inter_cost) * overlap_area) / + pix_num; + assert(overlap_area >= 0); + } + } + } + } +} + +static void tpl_model_update(TplDepFrame *tpl_frame, TplDepStats *tpl_stats, + int mi_row, int mi_col, const BLOCK_SIZE bsize) { + int idx, idy; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + TplDepStats *tpl_ptr = + &tpl_stats[(mi_row + idy) * tpl_frame->stride + (mi_col + idx)]; + tpl_model_update_b(tpl_frame, tpl_ptr, mi_row + idy, mi_col + idx, + BLOCK_8X8); + } + } +} + +static void get_quantize_error(MACROBLOCK *x, int plane, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, + TX_SIZE tx_size, int64_t *recon_error, + int64_t *sse, uint16_t *eob) { + MACROBLOCKD *const xd = &x->e_mbd; + const struct macroblock_plane *const p = &x->plane[plane]; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + const ScanOrder *const scan_order = &vp9_default_scan_orders[tx_size]; + int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]; + const int shift = tx_size == TX_32X32 ? 0 : 2; + + // skip block condition should be handled before this is called. 
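Stepping back to tpl_model_update_b() and get_overlap_area() above, a worked example with illustrative numbers: BLOCK_8X8 (bw = bh = 8), mi_row = mi_col = 4, full-pel mv = (3, -5):

    /* ref_pos   = (4*8 + 3, 4*8 - 5) = (35, 27)
     * grid base = (round_floor(35, 8) * 8, round_floor(27, 8) * 8) = (32, 24)
     * block 0 at (32, 24): 5 * 5 = 25    block 1 at (32, 32): 3 * 5 = 15
     * block 2 at (40, 24): 5 * 3 = 15    block 3 at (40, 32): 3 * 3 =  9
     * The areas sum to bw * bh = 64, so mc_flow is apportioned exactly
     * in proportion to how much of each grid block the mv covers. */

round_floor() keeps the flooring correct when the mv points above or left of the frame, e.g. round_floor(-3, 8) = -1 rather than 0.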
+ assert(!x->skip_block); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, + pd->dequant, eob, scan_order); + } else { + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); + } +#else + vp9_quantize_fp_32x32(coeff, pix_num, p, qcoeff, dqcoeff, pd->dequant, eob, + scan_order); +#endif // CONFIG_VP9_HIGHBITDEPTH + + *recon_error = vp9_block_error(coeff, dqcoeff, pix_num, sse) >> shift; + *recon_error = VPXMAX(*recon_error, 1); + + *sse = (*sse) >> shift; + *sse = VPXMAX(*sse, 1); +} + +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + // TODO(sdeng): Implement SIMD based high bit-depth Hadamard transforms. + switch (tx_size) { + case TX_8X8: vpx_highbd_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_highbd_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_highbd_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size) { + switch (tx_size) { + case TX_8X8: vpx_hadamard_8x8(src_diff, bw, coeff); break; + case TX_16X16: vpx_hadamard_16x16(src_diff, bw, coeff); break; + case TX_32X32: vpx_hadamard_32x32(src_diff, bw, coeff); break; + default: assert(0); + } +} + +static void set_mv_limits(const VP9_COMMON *cm, MACROBLOCK *x, int mi_row, + int mi_col) { + x->mv_limits.row_min = -((mi_row * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.row_max = + (cm->mi_rows - 1 - mi_row) * MI_SIZE + (17 - 2 * VP9_INTERP_EXTEND); + x->mv_limits.col_min = -((mi_col * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND)); + x->mv_limits.col_max = + ((cm->mi_cols - 1 - mi_col) * MI_SIZE) + (17 - 2 * VP9_INTERP_EXTEND); +} + +static int rate_estimator(const tran_low_t *qcoeff, int eob, TX_SIZE tx_size) { + const ScanOrder *const scan_order = &vp9_scan_orders[tx_size][DCT_DCT]; + int rate_cost = 1; + int idx; + assert((1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]]) >= eob); + for (idx = 0; idx < eob; ++idx) { + unsigned int abs_level = abs(qcoeff[scan_order->scan[idx]]); + rate_cost += get_msb(abs_level + 1) + 1 + (abs_level > 0); + } + + return (rate_cost << VP9_PROB_COST_SHIFT); +} + +static void mode_estimation(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, + struct scale_factors *sf, GF_PICTURE *gf_picture, + int frame_idx, TplDepFrame *tpl_frame, + int16_t *src_diff, tran_low_t *coeff, + tran_low_t *qcoeff, tran_low_t *dqcoeff, int mi_row, + int mi_col, BLOCK_SIZE bsize, TX_SIZE tx_size, + YV12_BUFFER_CONFIG *ref_frame[], uint8_t *predictor, + int64_t *recon_error, int64_t *rate_cost, + int64_t *sse, int *ref_frame_idx) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + + const int bw = 4 << b_width_log2_lookup[bsize]; + const int bh = 4 << b_height_log2_lookup[bsize]; + const int pix_num = bw * bh; + int best_rf_idx = -1; + int_mv best_mv; + int64_t best_inter_cost = INT64_MAX; + int64_t inter_cost; + int rf_idx; + const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP]; + + int64_t best_intra_cost = INT64_MAX; + int64_t intra_cost; + PREDICTION_MODE mode; + int mb_y_offset = mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + MODE_INFO mi_above, mi_left; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + TplDepStats 
*tpl_stats = + &tpl_frame->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + + xd->mb_to_top_edge = -((mi_row * MI_SIZE) * 8); + xd->mb_to_bottom_edge = ((cm->mi_rows - 1 - mi_row) * MI_SIZE) * 8; + xd->mb_to_left_edge = -((mi_col * MI_SIZE) * 8); + xd->mb_to_right_edge = ((cm->mi_cols - 1 - mi_col) * MI_SIZE) * 8; + xd->above_mi = (mi_row > 0) ? &mi_above : NULL; + xd->left_mi = (mi_col > 0) ? &mi_left : NULL; + + // Intra prediction search + for (mode = DC_PRED; mode <= TM_PRED; ++mode) { + uint8_t *src, *dst; + int src_stride, dst_stride; + + src = xd->cur_buf->y_buffer + mb_y_offset; + src_stride = xd->cur_buf->y_stride; + + dst = &predictor[0]; + dst_stride = bw; + + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + + vp9_predict_intra_block(xd, b_width_log2_lookup[bsize], tx_size, mode, src, + src_stride, dst, dst_stride, 0, 0, 0); + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vpx_highbd_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, + dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); + } +#else + vpx_subtract_block(bh, bw, src_diff, bw, src, src_stride, dst, dst_stride); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + intra_cost = vpx_satd(coeff, pix_num); +#endif // CONFIG_VP9_HIGHBITDEPTH + + if (intra_cost < best_intra_cost) best_intra_cost = intra_cost; + } + + // Motion compensated prediction + best_mv.as_int = 0; + + set_mv_limits(cm, x, mi_row, mi_col); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int_mv mv; +#if CONFIG_NON_GREEDY_MV + MotionField *motion_field; +#endif + if (ref_frame[rf_idx] == NULL) continue; + +#if CONFIG_NON_GREEDY_MV + (void)td; + motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); +#else + motion_compensated_prediction(cpi, td, xd->cur_buf->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, bsize, &mv.as_mv); +#endif + +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_highbd_build_inter_predictor( + CONVERT_TO_SHORTPTR(ref_frame[rf_idx]->y_buffer + mb_y_offset), + ref_frame[rf_idx]->y_stride, CONVERT_TO_SHORTPTR(&predictor[0]), bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, + mi_row * MI_SIZE, xd->bd); + vpx_highbd_subtract_block( + bh, bw, src_diff, bw, xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw, xd->bd); + vp9_highbd_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_highbd_satd(coeff, pix_num); + } else { + vp9_build_inter_predictor( + ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, &mv.as_mv, sf, bw, bh, + 0, kernel, MV_PRECISION_Q3, mi_col * MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); + } +#else + vp9_build_inter_predictor(ref_frame[rf_idx]->y_buffer + mb_y_offset, + ref_frame[rf_idx]->y_stride, &predictor[0], bw, + &mv.as_mv, sf, bw, bh, 0, kernel, MV_PRECISION_Q3, + mi_col * 
MI_SIZE, mi_row * MI_SIZE); + vpx_subtract_block(bh, bw, src_diff, bw, + xd->cur_buf->y_buffer + mb_y_offset, + xd->cur_buf->y_stride, &predictor[0], bw); + vp9_wht_fwd_txfm(src_diff, bw, coeff, tx_size); + inter_cost = vpx_satd(coeff, pix_num); +#endif + + if (inter_cost < best_inter_cost) { + uint16_t eob = 0; + best_rf_idx = rf_idx; + best_inter_cost = inter_cost; + best_mv.as_int = mv.as_int; + // Since best_inter_cost is initialized as INT64_MAX, recon_error and + // rate_cost will be calculated with the best reference frame. + get_quantize_error(x, 0, coeff, qcoeff, dqcoeff, tx_size, recon_error, + sse, &eob); + *rate_cost = rate_estimator(qcoeff, eob, tx_size); + } + } + best_intra_cost = VPXMAX(best_intra_cost, 1); + best_inter_cost = VPXMIN(best_intra_cost, best_inter_cost); + tpl_stats->inter_cost = VPXMAX( + 1, (best_inter_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->intra_cost = VPXMAX( + 1, (best_intra_cost << TPL_DEP_COST_SCALE_LOG2) / (mi_height * mi_width)); + tpl_stats->ref_frame_index = gf_picture[frame_idx].ref_frame[best_rf_idx]; + tpl_stats->mv.as_int = best_mv.as_int; + *ref_frame_idx = best_rf_idx; +} + +#if CONFIG_NON_GREEDY_MV +static int get_block_src_pred_buf(MACROBLOCKD *xd, GF_PICTURE *gf_picture, + int frame_idx, int rf_idx, int mi_row, + int mi_col, struct buf_2d *src, + struct buf_2d *pre) { + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + YV12_BUFFER_CONFIG *ref_frame = NULL; + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + ref_frame = gf_picture[ref_frame_idx].frame; + src->buf = xd->cur_buf->y_buffer + mb_y_offset; + src->stride = xd->cur_buf->y_stride; + pre->buf = ref_frame->y_buffer + mb_y_offset; + pre->stride = ref_frame->y_stride; + assert(src->stride == pre->stride); + return 1; + } else { + printf("invalid ref_frame_idx"); + assert(ref_frame_idx != -1); + return 0; + } +} + +#define kMvPreCheckLines 5 +#define kMvPreCheckSize 15 + +#define MV_REF_POS_NUM 3 +POSITION mv_ref_pos[MV_REF_POS_NUM] = { + { -1, 0 }, + { 0, -1 }, + { -1, -1 }, +}; + +static int_mv *get_select_mv(VP9_COMP *cpi, TplDepFrame *tpl_frame, int mi_row, + int mi_col) { + return &cpi->select_mv_arr[mi_row * tpl_frame->stride + mi_col]; +} + +static int_mv find_ref_mv(int mv_mode, VP9_COMP *cpi, TplDepFrame *tpl_frame, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + int i; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int_mv nearest_mv, near_mv, invalid_mv; + nearest_mv.as_int = INVALID_MV; + near_mv.as_int = INVALID_MV; + invalid_mv.as_int = INVALID_MV; + for (i = 0; i < MV_REF_POS_NUM; ++i) { + int nb_row = mi_row + mv_ref_pos[i].row * mi_height; + int nb_col = mi_col + mv_ref_pos[i].col * mi_width; + assert(mv_ref_pos[i].row <= 0); + assert(mv_ref_pos[i].col <= 0); + if (nb_row >= 0 && nb_col >= 0) { + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + } else { + int_mv mv = *get_select_mv(cpi, tpl_frame, nb_row, nb_col); + if (mv.as_int == nearest_mv.as_int) { + continue; + } else { + near_mv = mv; + break; + } + } + } + } + if (nearest_mv.as_int == INVALID_MV) { + nearest_mv.as_mv.row = 0; + nearest_mv.as_mv.col = 0; + } + if (near_mv.as_int == INVALID_MV) { + near_mv.as_mv.row = 0; + near_mv.as_mv.col = 0; + } + if (mv_mode == NEAREST_MV_MODE) { + return nearest_mv; + } + if (mv_mode == NEAR_MV_MODE) { + return near_mv; + } + assert(0); + 
return invalid_mv; +} + +static int_mv get_mv_from_mv_mode(int mv_mode, VP9_COMP *cpi, + MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + int_mv mv; + switch (mv_mode) { + case ZERO_MV_MODE: + mv.as_mv.row = 0; + mv.as_mv.col = 0; + break; + case NEW_MV_MODE: + mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + break; + case NEAREST_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + case NEAR_MV_MODE: + mv = find_ref_mv(mv_mode, cpi, tpl_frame, bsize, mi_row, mi_col); + break; + default: + mv.as_int = INVALID_MV; + assert(0); + break; + } + return mv; +} + +static double get_mv_dist(int mv_mode, VP9_COMP *cpi, MACROBLOCKD *xd, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + uint32_t sse; + struct buf_2d src; + struct buf_2d pre; + MV full_mv; + *mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, bsize, + mi_row, mi_col); + full_mv = get_full_mv(&mv->as_mv); + if (get_block_src_pred_buf(xd, gf_picture, frame_idx, rf_idx, mi_row, mi_col, + &src, &pre)) { + // TODO(angiebird): Consider subpixel when computing the sse. + cpi->fn_ptr[bsize].vf(src.buf, src.stride, get_buf_from_mv(&pre, &full_mv), + pre.stride, &sse); + return (double)(sse << VP9_DIST_SCALE_LOG2); + } else { + assert(0); + return 0; + } +} + +static int get_mv_mode_cost(int mv_mode) { + // TODO(angiebird): The probabilities are roughly inferred from + // default_inter_mode_probs. Check if there is a better way to set the + // probabilities. + const int zero_mv_prob = 16; + const int new_mv_prob = 24 * 1; + const int ref_mv_prob = 256 - zero_mv_prob - new_mv_prob; + assert(zero_mv_prob + new_mv_prob + ref_mv_prob == 256); + switch (mv_mode) { + case ZERO_MV_MODE: return vp9_prob_cost[zero_mv_prob]; break; + case NEW_MV_MODE: return vp9_prob_cost[new_mv_prob]; break; + case NEAREST_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + case NEAR_MV_MODE: return vp9_prob_cost[ref_mv_prob]; break; + default: assert(0); return -1; + } +} + +static INLINE double get_mv_diff_cost(MV *new_mv, MV *ref_mv) { + double mv_diff_cost = log2(1 + abs(new_mv->row - ref_mv->row)) + + log2(1 + abs(new_mv->col - ref_mv->col)); + mv_diff_cost *= (1 << VP9_PROB_COST_SHIFT); + return mv_diff_cost; +} +static double get_mv_cost(int mv_mode, VP9_COMP *cpi, MotionField *motion_field, + TplDepFrame *tpl_frame, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + double mv_cost = get_mv_mode_cost(mv_mode); + if (mv_mode == NEW_MV_MODE) { + MV new_mv = get_mv_from_mv_mode(mv_mode, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + MV nearest_mv = get_mv_from_mv_mode(NEAREST_MV_MODE, cpi, motion_field, + tpl_frame, bsize, mi_row, mi_col) + .as_mv; + MV near_mv = get_mv_from_mv_mode(NEAR_MV_MODE, cpi, motion_field, tpl_frame, + bsize, mi_row, mi_col) + .as_mv; + double nearest_cost = get_mv_diff_cost(&new_mv, &nearest_mv); + double near_cost = get_mv_diff_cost(&new_mv, &near_mv); + mv_cost += nearest_cost < near_cost ? 
nearest_cost : near_cost; + } + return mv_cost; +} + +static double eval_mv_mode(int mv_mode, VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + int_mv *mv) { + MACROBLOCKD *xd = &x->e_mbd; + double mv_dist = + get_mv_dist(mv_mode, cpi, xd, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, mv); + double mv_cost = + get_mv_cost(mv_mode, cpi, motion_field, tpl_frame, bsize, mi_row, mi_col); + double mult = 180; + + return mv_cost + mult * log2f(1 + mv_dist); +} + +static int find_best_ref_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col, + double *rd, int_mv *mv) { + int best_mv_mode = ZERO_MV_MODE; + int update = 0; + int mv_mode; + *rd = 0; + for (mv_mode = 0; mv_mode < MAX_MV_MODE; ++mv_mode) { + double this_rd; + int_mv this_mv; + if (mv_mode == NEW_MV_MODE) { + continue; + } + this_rd = eval_mv_mode(mv_mode, cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize, mi_row, mi_col, &this_mv); + if (update == 0) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + update = 1; + } else { + if (this_rd < *rd) { + *rd = this_rd; + *mv = this_mv; + best_mv_mode = mv_mode; + } + } + } + return best_mv_mode; +} + +static void predict_mv_mode(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, MotionField *motion_field, + int frame_idx, TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize, int mi_row, int mi_col) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int tmp_mv_mode_arr[kMvPreCheckSize]; + int *mv_mode_arr = tpl_frame->mv_mode_arr[rf_idx]; + double *rd_diff_arr = tpl_frame->rd_diff_arr[rf_idx]; + int_mv *select_mv_arr = cpi->select_mv_arr; + int_mv tmp_select_mv_arr[kMvPreCheckSize]; + int stride = tpl_frame->stride; + double new_mv_rd = 0; + double no_new_mv_rd = 0; + double this_new_mv_rd = 0; + double this_no_new_mv_rd = 0; + int idx; + int tmp_idx; + assert(kMvPreCheckSize == (kMvPreCheckLines * (kMvPreCheckLines + 1)) >> 1); + + // no new mv + // diagonal scan order + tmp_idx = 0; + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + if (r == 0 && c == 0) { + this_no_new_mv_rd = this_rd; + } + no_new_mv_rd += this_rd; + tmp_mv_mode_arr[tmp_idx] = mv_mode_arr[nb_row * stride + nb_col]; + tmp_select_mv_arr[tmp_idx] = select_mv_arr[nb_row * stride + nb_col]; + ++tmp_idx; + } + } + } + + // new mv + mv_mode_arr[mi_row * stride + mi_col] = NEW_MV_MODE; + this_new_mv_rd = eval_mv_mode( + NEW_MV_MODE, cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col, &select_mv_arr[mi_row * stride + mi_col]); + new_mv_rd = this_new_mv_rd; + // We start from idx = 1 because idx = 0 is evaluated as NEW_MV_MODE + // beforehand. 
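The diagonal scan used here and in the loop below visits the look-ahead window in this order (r, c in units of the current block size):

    /* idx 0: (0,0)
     * idx 1: (0,1) (1,0)
     * idx 2: (0,2) (1,1) (2,0)
     * idx 3: (0,3) (1,2) (2,1) (3,0)
     * idx 4: (0,4) (1,3) (2,2) (3,1) (4,0)
     * 15 positions in total, matching kMvPreCheckSize = (5 * 6) / 2. */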
+ for (idx = 1; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + double this_rd; + int_mv *mv = &select_mv_arr[nb_row * stride + nb_col]; + mv_mode_arr[nb_row * stride + nb_col] = find_best_ref_mv_mode( + cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, rf_idx, + bsize, nb_row, nb_col, &this_rd, mv); + new_mv_rd += this_rd; + } + } + } + + // update best_mv_mode + tmp_idx = 0; + if (no_new_mv_rd < new_mv_rd) { + for (idx = 0; idx < kMvPreCheckLines; ++idx) { + int r; + for (r = 0; r <= idx; ++r) { + int c = idx - r; + int nb_row = mi_row + r * mi_height; + int nb_col = mi_col + c * mi_width; + if (nb_row < tpl_frame->mi_rows && nb_col < tpl_frame->mi_cols) { + mv_mode_arr[nb_row * stride + nb_col] = tmp_mv_mode_arr[tmp_idx]; + select_mv_arr[nb_row * stride + nb_col] = tmp_select_mv_arr[tmp_idx]; + ++tmp_idx; + } + } + } + rd_diff_arr[mi_row * stride + mi_col] = 0; + } else { + rd_diff_arr[mi_row * stride + mi_col] = + (no_new_mv_rd - this_no_new_mv_rd) - (new_mv_rd - this_new_mv_rd); + } +} + +static void predict_mv_mode_arr(VP9_COMP *cpi, MACROBLOCK *x, + GF_PICTURE *gf_picture, + MotionField *motion_field, int frame_idx, + TplDepFrame *tpl_frame, int rf_idx, + BLOCK_SIZE bsize) { + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int unit_rows = tpl_frame->mi_rows / mi_height; + const int unit_cols = tpl_frame->mi_cols / mi_width; + const int max_diagonal_lines = unit_rows + unit_cols - 1; + int idx; + for (idx = 0; idx < max_diagonal_lines; ++idx) { + int r; + for (r = VPXMAX(idx - unit_cols + 1, 0); r <= VPXMIN(idx, unit_rows - 1); + ++r) { + int c = idx - r; + int mi_row = r * mi_height; + int mi_col = c * mi_width; + assert(c >= 0 && c < unit_cols); + assert(mi_row >= 0 && mi_row < tpl_frame->mi_rows); + assert(mi_col >= 0 && mi_col < tpl_frame->mi_cols); + predict_mv_mode(cpi, x, gf_picture, motion_field, frame_idx, tpl_frame, + rf_idx, bsize, mi_row, mi_col); + } + } +} + +static void do_motion_search(VP9_COMP *cpi, ThreadData *td, + MotionField *motion_field, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE bsize, + int mi_row, int mi_col) { + VP9_COMMON *cm = &cpi->common; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + const int mb_y_offset = + mi_row * MI_SIZE * xd->cur_buf->y_stride + mi_col * MI_SIZE; + assert(ref_frame != NULL); + set_mv_limits(cm, x, mi_row, mi_col); + { + int_mv mv = vp9_motion_field_mi_get_mv(motion_field, mi_row, mi_col); + uint8_t *cur_frame_buf = xd->cur_buf->y_buffer + mb_y_offset; + uint8_t *ref_frame_buf = ref_frame->y_buffer + mb_y_offset; + const int stride = xd->cur_buf->y_stride; + full_pixel_motion_search(cpi, td, motion_field, frame_idx, cur_frame_buf, + ref_frame_buf, stride, bsize, mi_row, mi_col, + &mv.as_mv); + sub_pixel_motion_search(cpi, td, cur_frame_buf, ref_frame_buf, stride, + bsize, &mv.as_mv); + vp9_motion_field_mi_set_mv(motion_field, mi_row, mi_col, mv); + } +} + +static void build_motion_field( + VP9_COMP *cpi, int frame_idx, + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES], BLOCK_SIZE bsize) { + VP9_COMMON *cm = &cpi->common; + ThreadData *td = &cpi->td; + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + const int 
pw = num_4x4_blocks_wide_lookup[bsize] << 2; + const int ph = num_4x4_blocks_high_lookup[bsize] << 2; + int mi_row, mi_col; + int rf_idx; + + tpl_frame->lambda = (pw * ph) >> 2; + assert(pw * ph == tpl_frame->lambda << 2); + + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + if (ref_frame[rf_idx] == NULL) { + continue; + } + vp9_motion_field_reset_mvs(motion_field); + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + do_motion_search(cpi, td, motion_field, frame_idx, ref_frame[rf_idx], + bsize, mi_row, mi_col); + } + } + } +} +#endif // CONFIG_NON_GREEDY_MV + +static void mc_flow_dispenser(VP9_COMP *cpi, GF_PICTURE *gf_picture, + int frame_idx, BLOCK_SIZE bsize) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + VpxTplFrameStats *tpl_frame_stats_before_propagation = + &cpi->tpl_gop_stats.frame_stats_list[frame_idx]; + YV12_BUFFER_CONFIG *this_frame = gf_picture[frame_idx].frame; + YV12_BUFFER_CONFIG *ref_frame[MAX_INTER_REF_FRAMES] = { NULL, NULL, NULL }; + + VP9_COMMON *cm = &cpi->common; + struct scale_factors sf; + int rdmult, idx; + ThreadData *td = &cpi->td; + MACROBLOCK *x = &td->mb; + MACROBLOCKD *xd = &x->e_mbd; + int mi_row, mi_col; + +#if CONFIG_VP9_HIGHBITDEPTH + DECLARE_ALIGNED(16, uint16_t, predictor16[32 * 32 * 3]); + DECLARE_ALIGNED(16, uint8_t, predictor8[32 * 32 * 3]); + uint8_t *predictor; +#else + DECLARE_ALIGNED(16, uint8_t, predictor[32 * 32 * 3]); +#endif + DECLARE_ALIGNED(16, int16_t, src_diff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, coeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, qcoeff[32 * 32]); + DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]); + + const TX_SIZE tx_size = max_txsize_lookup[bsize]; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + + tpl_frame_stats_before_propagation->frame_width = cm->width; + tpl_frame_stats_before_propagation->frame_height = cm->height; + // Setup scaling factor +#if CONFIG_VP9_HIGHBITDEPTH + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height, + cpi->common.use_highbitdepth); + + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) + predictor = CONVERT_TO_BYTEPTR(predictor16); + else + predictor = predictor8; +#else + vp9_setup_scale_factors_for_frame( + &sf, this_frame->y_crop_width, this_frame->y_crop_height, + this_frame->y_crop_width, this_frame->y_crop_height); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Prepare reference frame pointers. If any reference frame slot is + // unavailable, the pointer will be set to Null. + for (idx = 0; idx < MAX_INTER_REF_FRAMES; ++idx) { + int rf_idx = gf_picture[frame_idx].ref_frame[idx]; + if (rf_idx != -1) ref_frame[idx] = gf_picture[rf_idx].frame; + } + + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + xd->cur_buf = this_frame; + + // Get rd multiplier set up. 
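Two per-frame setup details worth noting: the scale factors are deliberately 1:1 (source and destination dimensions are identical), so the inter predictors built during mode estimation never rescale; and the rd multiplier below is derived from tpl_frame->base_qindex, with the quantizer re-initialized to that same qindex immediately after.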
+ rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, tpl_frame->base_qindex); + set_error_per_bit(&cpi->td.mb, rdmult); + vp9_initialize_me_consts(cpi, &cpi->td.mb, tpl_frame->base_qindex); + + tpl_frame->is_valid = 1; + + cm->base_qindex = tpl_frame->base_qindex; + vp9_frame_init_quantizer(cpi); + +#if CONFIG_NON_GREEDY_MV + { + int square_block_idx; + int rf_idx; + for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES; + ++square_block_idx) { + BLOCK_SIZE square_bsize = square_block_idx_to_bsize(square_block_idx); + build_motion_field(cpi, frame_idx, ref_frame, square_bsize); + } + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + int ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + MotionField *motion_field = vp9_motion_field_info_get_motion_field( + &cpi->motion_field_info, frame_idx, rf_idx, bsize); + predict_mv_mode_arr(cpi, x, gf_picture, motion_field, frame_idx, + tpl_frame, rf_idx, bsize); + } + } + } +#endif // CONFIG_NON_GREEDY_MV + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + int64_t recon_error = 0; + int64_t rate_cost = 0; + int64_t sse = 0; + // Ref frame index in the ref frame buffer. + int ref_frame_idx = -1; + mode_estimation(cpi, x, xd, &sf, gf_picture, frame_idx, tpl_frame, + src_diff, coeff, qcoeff, dqcoeff, mi_row, mi_col, bsize, + tx_size, ref_frame, predictor, &recon_error, &rate_cost, + &sse, &ref_frame_idx); + // Motion flow dependency dispenser. + tpl_model_store(tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, + tpl_frame->stride); + + tpl_store_before_propagation( + tpl_frame_stats_before_propagation->block_stats_list, + tpl_frame->tpl_stats_ptr, mi_row, mi_col, bsize, tpl_frame->stride, + recon_error, rate_cost, ref_frame_idx); + + tpl_model_update(cpi->tpl_stats, tpl_frame->tpl_stats_ptr, mi_row, mi_col, + bsize); + } + } +} + +static void trim_tpl_stats(struct vpx_internal_error_info *error_info, + VpxTplGopStats *tpl_gop_stats, int extra_frames) { + int i; + VpxTplFrameStats *new_frame_stats; + const int new_size = tpl_gop_stats->size - extra_frames; + if (tpl_gop_stats->size <= extra_frames) + vpx_internal_error( + error_info, VPX_CODEC_ERROR, + "The number of frames in VpxTplGopStats is fewer than expected."); + CHECK_MEM_ERROR(error_info, new_frame_stats, + vpx_calloc(new_size, sizeof(*new_frame_stats))); + for (i = 0; i < new_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats->num_blocks; + new_frame_stats[i].num_blocks = frame_stats->num_blocks; + new_frame_stats[i].frame_width = frame_stats->frame_width; + new_frame_stats[i].frame_height = frame_stats->frame_height; + new_frame_stats[i].num_blocks = num_blocks; + CHECK_MEM_ERROR( + error_info, new_frame_stats[i].block_stats_list, + vpx_calloc(num_blocks, sizeof(*new_frame_stats[i].block_stats_list))); + memcpy(new_frame_stats[i].block_stats_list, frame_stats->block_stats_list, + num_blocks * sizeof(*new_frame_stats[i].block_stats_list)); + } + free_tpl_frame_stats_list(tpl_gop_stats); + tpl_gop_stats->size = new_size; + tpl_gop_stats->frame_stats_list = new_frame_stats; +} + +#if CONFIG_NON_GREEDY_MV +#define DUMP_TPL_STATS 0 +#if DUMP_TPL_STATS +static void dump_buf(uint8_t *buf, int stride, int row, int col, int h, int w) { + int i, j; + printf("%d %d\n", h, w); + for (i = 0; i < h; ++i) { + for (j = 0; j < w; ++j) { + printf("%d ", buf[(row + i) * stride + col + j]); + } + } + 
printf("\n"); +} + +static void dump_frame_buf(const YV12_BUFFER_CONFIG *frame_buf) { + dump_buf(frame_buf->y_buffer, frame_buf->y_stride, 0, 0, frame_buf->y_height, + frame_buf->y_width); + dump_buf(frame_buf->u_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); + dump_buf(frame_buf->v_buffer, frame_buf->uv_stride, 0, 0, + frame_buf->uv_height, frame_buf->uv_width); +} + +static void dump_tpl_stats(const VP9_COMP *cpi, int tpl_group_frames, + const GF_GROUP *gf_group, + const GF_PICTURE *gf_picture, BLOCK_SIZE bsize) { + int frame_idx; + const VP9_COMMON *cm = &cpi->common; + int rf_idx; + for (frame_idx = 1; frame_idx < tpl_group_frames; ++frame_idx) { + for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) { + const TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + int mi_row, mi_col; + int ref_frame_idx; + const int mi_height = num_8x8_blocks_high_lookup[bsize]; + const int mi_width = num_8x8_blocks_wide_lookup[bsize]; + ref_frame_idx = gf_picture[frame_idx].ref_frame[rf_idx]; + if (ref_frame_idx != -1) { + YV12_BUFFER_CONFIG *ref_frame_buf = gf_picture[ref_frame_idx].frame; + const int gf_frame_offset = gf_group->frame_gop_index[frame_idx]; + const int ref_gf_frame_offset = + gf_group->frame_gop_index[ref_frame_idx]; + printf("=\n"); + printf( + "frame_idx %d mi_rows %d mi_cols %d bsize %d ref_frame_idx %d " + "rf_idx %d gf_frame_offset %d ref_gf_frame_offset %d\n", + frame_idx, cm->mi_rows, cm->mi_cols, mi_width * MI_SIZE, + ref_frame_idx, rf_idx, gf_frame_offset, ref_gf_frame_offset); + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + int_mv mv = vp9_motion_field_info_get_mv(&cpi->motion_field_info, + frame_idx, rf_idx, bsize, + mi_row, mi_col); + printf("%d %d %d %d\n", mi_row, mi_col, mv.as_mv.row, + mv.as_mv.col); + } + } + } + for (mi_row = 0; mi_row < cm->mi_rows; ++mi_row) { + for (mi_col = 0; mi_col < cm->mi_cols; ++mi_col) { + if ((mi_row % mi_height) == 0 && (mi_col % mi_width) == 0) { + const TplDepStats *tpl_ptr = + &tpl_frame + ->tpl_stats_ptr[mi_row * tpl_frame->stride + mi_col]; + printf("%f ", tpl_ptr->feature_score); + } + } + } + printf("\n"); + + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += mi_height) { + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += mi_width) { + const int mv_mode = + tpl_frame + ->mv_mode_arr[rf_idx][mi_row * tpl_frame->stride + mi_col]; + printf("%d ", mv_mode); + } + } + printf("\n"); + + dump_frame_buf(gf_picture[frame_idx].frame); + dump_frame_buf(ref_frame_buf); + } + } + } +} +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +void vp9_init_tpl_buffer(VP9_COMP *cpi) { + VP9_COMMON *cm = &cpi->common; + int frame; + + const int mi_cols = mi_cols_aligned_to_sb(cm->mi_cols); + const int mi_rows = mi_cols_aligned_to_sb(cm->mi_rows); +#if CONFIG_NON_GREEDY_MV + int rf_idx; + + vpx_free(cpi->select_mv_arr); + CHECK_MEM_ERROR( + &cm->error, cpi->select_mv_arr, + vpx_calloc(mi_rows * mi_cols * 4, sizeof(*cpi->select_mv_arr))); +#endif + + // TODO(jingning): Reduce the actual memory use for tpl model build up. 
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+    if (cpi->tpl_stats[frame].width >= mi_cols &&
+        cpi->tpl_stats[frame].height >= mi_rows &&
+        cpi->tpl_stats[frame].tpl_stats_ptr)
+      continue;
+
+#if CONFIG_NON_GREEDY_MV
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->tpl_stats[frame].mv_mode_arr[rf_idx],
+          vpx_calloc(mi_rows * mi_cols * 4,
+                     sizeof(*cpi->tpl_stats[frame].mv_mode_arr[rf_idx])));
+      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+      CHECK_MEM_ERROR(
+          &cm->error, cpi->tpl_stats[frame].rd_diff_arr[rf_idx],
+          vpx_calloc(mi_rows * mi_cols * 4,
+                     sizeof(*cpi->tpl_stats[frame].rd_diff_arr[rf_idx])));
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    CHECK_MEM_ERROR(&cm->error, cpi->tpl_stats[frame].tpl_stats_ptr,
+                    vpx_calloc(mi_rows * mi_cols,
+                               sizeof(*cpi->tpl_stats[frame].tpl_stats_ptr)));
+    cpi->tpl_stats[frame].is_valid = 0;
+    cpi->tpl_stats[frame].width = mi_cols;
+    cpi->tpl_stats[frame].height = mi_rows;
+    cpi->tpl_stats[frame].stride = mi_cols;
+    cpi->tpl_stats[frame].mi_rows = cm->mi_rows;
+    cpi->tpl_stats[frame].mi_cols = cm->mi_cols;
+  }
+
+  for (frame = 0; frame < REF_FRAMES; ++frame) {
+    cpi->enc_frame_buf[frame].mem_valid = 0;
+    cpi->enc_frame_buf[frame].released = 1;
+  }
+}
+
+void vp9_free_tpl_buffer(VP9_COMP *cpi) {
+  int frame;
+#if CONFIG_NON_GREEDY_MV
+  vp9_free_motion_field_info(&cpi->motion_field_info);
+  vpx_free(cpi->select_mv_arr);
+#endif
+  for (frame = 0; frame < MAX_ARF_GOP_SIZE; ++frame) {
+#if CONFIG_NON_GREEDY_MV
+    int rf_idx;
+    for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
+      vpx_free(cpi->tpl_stats[frame].mv_mode_arr[rf_idx]);
+      vpx_free(cpi->tpl_stats[frame].rd_diff_arr[rf_idx]);
+    }
+#endif
+    vpx_free(cpi->tpl_stats[frame].tpl_stats_ptr);
+    cpi->tpl_stats[frame].is_valid = 0;
+  }
+  free_tpl_frame_stats_list(&cpi->tpl_gop_stats);
+}
+
+#if CONFIG_RATE_CTRL
+static void accumulate_frame_tpl_stats(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+  int show_frame_count = 0;
+  int frame_idx;
+  // Accumulate tpl stats for each frame in the current group of pictures.
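Before the loop itself, a sketch of how a rate controller might read the aggregates it fills in (hypothetical consumer, purely illustrative):

    /* Compare how strongly shown frame i is referenced later in the
     * group; larger ratios argue for spending more bits on it. */
    const double dep_ratio =
        (double)cpi->tpl_stats_info[i].mc_dep_cost /
        (double)VPXMAX(cpi->tpl_stats_info[i].intra_cost, 1);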
+ for (frame_idx = 1; frame_idx < gf_group->gf_group_size; ++frame_idx) { + TplDepFrame *tpl_frame = &cpi->tpl_stats[frame_idx]; + TplDepStats *tpl_stats = tpl_frame->tpl_stats_ptr; + const int tpl_stride = tpl_frame->stride; + int64_t intra_cost_base = 0; + int64_t inter_cost_base = 0; + int64_t mc_dep_cost_base = 0; + int64_t mc_ref_cost_base = 0; + int64_t mc_flow_base = 0; + int row, col; + + if (!tpl_frame->is_valid) continue; + + for (row = 0; row < cm->mi_rows && tpl_frame->is_valid; ++row) { + for (col = 0; col < cm->mi_cols; ++col) { + TplDepStats *this_stats = &tpl_stats[row * tpl_stride + col]; + intra_cost_base += this_stats->intra_cost; + inter_cost_base += this_stats->inter_cost; + mc_dep_cost_base += this_stats->mc_dep_cost; + mc_ref_cost_base += this_stats->mc_ref_cost; + mc_flow_base += this_stats->mc_flow; + } + } + + cpi->tpl_stats_info[show_frame_count].intra_cost = intra_cost_base; + cpi->tpl_stats_info[show_frame_count].inter_cost = inter_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_dep_cost = mc_dep_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_ref_cost = mc_ref_cost_base; + cpi->tpl_stats_info[show_frame_count].mc_flow = mc_flow_base; + + ++show_frame_count; + } +} +#endif // CONFIG_RATE_CTRL + +void vp9_setup_tpl_stats(VP9_COMP *cpi) { + GF_PICTURE gf_picture[MAX_ARF_GOP_SIZE]; + const GF_GROUP *gf_group = &cpi->twopass.gf_group; + int tpl_group_frames = 0; + int frame_idx; + int extended_frame_count; + cpi->tpl_bsize = BLOCK_32X32; + + extended_frame_count = + init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames); + + init_tpl_stats(cpi); + + init_tpl_stats_before_propagation(&cpi->common.error, &cpi->tpl_gop_stats, + cpi->tpl_stats, tpl_group_frames, + cpi->common.width, cpi->common.height); + + // Backward propagation from tpl_group_frames to 1. + for (frame_idx = tpl_group_frames - 1; frame_idx > 0; --frame_idx) { + if (gf_picture[frame_idx].update_type == USE_BUF_FRAME) continue; + mc_flow_dispenser(cpi, gf_picture, frame_idx, cpi->tpl_bsize); + } + + // TPL stats has extra frames from next GOP. Trim those extra frames for + // Qmode. + trim_tpl_stats(&cpi->common.error, &cpi->tpl_gop_stats, extended_frame_count); + + if (cpi->ext_ratectrl.ready && + cpi->ext_ratectrl.funcs.send_tpl_gop_stats != NULL) { + const vpx_codec_err_t codec_status = + vp9_extrc_send_tpl_stats(&cpi->ext_ratectrl, &cpi->tpl_gop_stats); + if (codec_status != VPX_CODEC_OK) { + vpx_internal_error(&cpi->common.error, codec_status, + "vp9_extrc_send_tpl_stats() failed"); + } + } + +#if CONFIG_NON_GREEDY_MV + cpi->tpl_ready = 1; +#if DUMP_TPL_STATS + dump_tpl_stats(cpi, tpl_group_frames, gf_group, gf_picture, cpi->tpl_bsize); +#endif // DUMP_TPL_STATS +#endif // CONFIG_NON_GREEDY_MV + +#if CONFIG_RATE_CTRL + if (cpi->oxcf.use_simple_encode_api) { + accumulate_frame_tpl_stats(cpi); + } +#endif // CONFIG_RATE_CTRL +} diff --git a/vp9/encoder/vp9_tpl_model.h b/vp9/encoder/vp9_tpl_model.h new file mode 100644 index 000000000..04beb2261 --- /dev/null +++ b/vp9/encoder/vp9_tpl_model.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ +#define VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef M_LOG2_E +#define M_LOG2_E 0.693147180559945309417 +#endif +#define log2f(x) (log(x) / (float)M_LOG2_E) + +#define TPL_DEP_COST_SCALE_LOG2 4 + +typedef struct GF_PICTURE { + YV12_BUFFER_CONFIG *frame; + int ref_frame[3]; + FRAME_UPDATE_TYPE update_type; +} GF_PICTURE; + +void vp9_init_tpl_buffer(VP9_COMP *cpi); +void vp9_setup_tpl_stats(VP9_COMP *cpi); +void vp9_free_tpl_buffer(VP9_COMP *cpi); + +void vp9_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_highbd_wht_fwd_txfm(int16_t *src_diff, int bw, tran_low_t *coeff, + TX_SIZE tx_size); +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VP9_ENCODER_VP9_TPL_MODEL_H_ diff --git a/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/vp9/encoder/x86/highbd_temporal_filter_sse4.c index a7f5117cf..97f182c66 100644 --- a/vp9/encoder/x86/highbd_temporal_filter_sse4.c +++ b/vp9/encoder/x86/highbd_temporal_filter_sse4.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/x86/temporal_filter_constants.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" // Compute (a-b)**2 for 8 pixels with size 16-bit static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b, @@ -141,11 +141,12 @@ static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32, count_u16 = _mm_adds_epu16(count_u16, sum_u16); _mm_storeu_si128((__m128i *)count, count_u16); - pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16); - pred_0_u32 = _mm_cvtepu16_epi32(pred_u16); pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero); + pred_0_u32 = _mm_mullo_epi32(sum_first_u32, pred_0_u32); + pred_1_u32 = _mm_mullo_epi32(sum_second_u32, pred_1_u32); + accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator); accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4)); diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index 87e68fb43..7571bfcca 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -16,7 +16,7 @@ #include "vpx/vpx_integer.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_temporal_filter.h" -#include "vp9/encoder/x86/temporal_filter_constants.h" +#include "vp9/encoder/vp9_temporal_filter_constants.h" // Read in 8 pixels from a and b as 8-bit unsigned integers, compute the // difference squared, and store as unsigned 16-bit integer to dst. diff --git a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/vp9/encoder/x86/vp9_diamond_search_sad_avx.c deleted file mode 100644 index 0e04a2f41..000000000 --- a/vp9/encoder/x86/vp9_diamond_search_sad_avx.c +++ /dev/null @@ -1,317 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#if defined(_MSC_VER) -#include <intrin.h> -#endif -#include <emmintrin.h> -#include <smmintrin.h> - -#include "vpx_dsp/vpx_dsp_common.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vpx_ports/mem.h" - -#ifdef __GNUC__ -#define LIKELY(v) __builtin_expect(v, 1) -#define UNLIKELY(v) __builtin_expect(v, 0) -#else -#define LIKELY(v) (v) -#define UNLIKELY(v) (v) -#endif - -static INLINE int_mv pack_int_mv(int16_t row, int16_t col) { - int_mv result; - result.as_mv.row = row; - result.as_mv.col = col; - return result; -} - -static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) { - // This is simplified from the C implementation to utilise that - // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and - // x->nmvjointsadcost[1] == x->nmvjointsadcost[3] - return mv.as_int == 0 ? 0 : 1; -} - -static INLINE int mv_cost(const int_mv mv, const int *joint_cost, - int *const comp_cost[2]) { - return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] + - comp_cost[1][mv.as_mv.col]; -} - -static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref, - int sad_per_bit) { - const int_mv diff = - pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col); - return ROUND_POWER_OF_TWO( - (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit, - VP9_PROB_COST_SHIFT); -} - -/***************************************************************************** - * This function utilizes 3 properties of the cost function lookup tables, * - * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * - * vp9_encoder.c. * - * For the joint cost: * - * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * - * For the component costs: * - * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * - * (Equal costs for both components) * - * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * - * (Cost function is even) * - * If these do not hold, then this function cannot be used without * - * modification, in which case you can revert to using the C implementation, * - * which does not rely on these properties. * - *****************************************************************************/ -int vp9_diamond_search_sad_avx(const MACROBLOCK *x, - const search_site_config *cfg, MV *ref_mv, - MV *best_mv, int search_param, int sad_per_bit, - int *num00, const vp9_variance_fn_ptr_t *fn_ptr, - const MV *center_mv) { - const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max); - const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int); - const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min); - const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int); - - const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); - - const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); - const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); - - // search_param determines the length of the initial step and hence the number - // of iterations. - // 0 = initial step (MAX_FIRST_STEP) pel - // 1 = (MAX_FIRST_STEP/2) pel, - // 2 = (MAX_FIRST_STEP/4) pel... 
- const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; - const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; - const int tot_steps = cfg->total_steps - search_param; - - const int_mv fcenter_mv = - pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); - const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int); - - const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); - const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); - - int_mv bmv = pack_int_mv(ref_row, ref_col); - int_mv new_bmv = bmv; - __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int); - - const int what_stride = x->plane[0].src.stride; - const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; - const uint8_t *const what = x->plane[0].src.buf; - const uint8_t *const in_what = - x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; - - // Work out the start point for the search - const uint8_t *best_address = in_what; - const uint8_t *new_best_address = best_address; -#if VPX_ARCH_X86_64 - __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - unsigned int best_sad; - int i, j, step; - - // Check the prerequisite cost function properties that are easy to check - // in an assert. See the function-level documentation for details on all - // prerequisites. - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); - assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); - - // Check the starting position - best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); - best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); - - *num00 = 0; - - for (i = 0, step = 0; step < tot_steps; step++) { - for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { - __m128i v_sad_d, v_cost_d, v_outside_d, v_inside_d, v_diff_mv_w; -#if VPX_ARCH_X86_64 - __m128i v_blocka[2]; -#else - __m128i v_blocka[1]; -#endif - - // Compute the candidate motion vectors - const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i *)&ss_mv[i]); - const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); - // Clamp them to the search bounds - __m128i v_these_mv_clamp_w = v_these_mv_w; - v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); - v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); - // The ones that did not change are inside the search area - v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); - - // If none of them are inside, then move on - if (LIKELY(_mm_test_all_zeros(v_inside_d, v_inside_d))) { - continue; - } - - // The inverse mask indicates which of the MVs are outside - v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8((int8_t)0xff)); - // Shift right to keep the sign bit clear, we will use this later - // to set the cost to the maximum value. - v_outside_d = _mm_srli_epi32(v_outside_d, 1); - - // Compute the difference MV - v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); - // We utilise the fact that the cost function is even, and use the - // absolute difference. This allows us to use unsigned indexes later - // and reduces cache pressure somewhat as only a half of the table - // is ever referenced. - v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); - - // Compute the SIMD pointer offsets. 
- { -#if VPX_ARCH_X86_64 // sizeof(intptr_t) == 8 - // Load the offsets - __m128i v_bo10_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 0]); - __m128i v_bo32_q = _mm_loadu_si128((const __m128i *)&ss_os[i + 2]); - // Set the ones falling outside to zero - v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); - v_bo32_q = - _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); - // Compute the candidate addresses - v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); - v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); -#else // VPX_ARCH_X86 // sizeof(intptr_t) == 4 - __m128i v_bo_d = _mm_loadu_si128((const __m128i *)&ss_os[i]); - v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); - v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); -#endif - } - - fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], - in_what_stride, (uint32_t *)&v_sad_d); - - // Look up the component cost of the residual motion vector - { - const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); - const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); - const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); - const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); - const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); - const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); - const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); - const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); - - // Note: This is a use case for vpgather in AVX2 - const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; - const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; - const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; - const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; - - __m128i v_cost_10_d, v_cost_32_d; - v_cost_10_d = _mm_cvtsi32_si128(cost0); - v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); - v_cost_32_d = _mm_cvtsi32_si128(cost2); - v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); - v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); - } - - // Now add in the joint cost - { - const __m128i v_sel_d = - _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); - const __m128i v_joint_cost_d = - _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); - v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); - } - - // Multiply by sad_per_bit - v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); - // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT) - v_cost_d = _mm_add_epi32(v_cost_d, - _mm_set1_epi32(1 << (VP9_PROB_COST_SHIFT - 1))); - v_cost_d = _mm_srai_epi32(v_cost_d, VP9_PROB_COST_SHIFT); - // Add the cost to the sad - v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); - - // Make the motion vectors outside the search area have max cost - // by or'ing in the comparison mask, this way the minimum search won't - // pick them. - v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); - - // Find the minimum value and index horizontally in v_sad_d - { - // Try speculatively on 16 bits, so we can use the minpos intrinsic - const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); - const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); - - uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); - uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); - - // If the local best value is not saturated, just use it, otherwise - // find the horizontal minimum again the hard way on 32 bits. - // This is executed rarely. 
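For the record, the trick in the code being deleted here: _mm_minpos_epu16 only exists for unsigned 16-bit lanes, so the 32-bit SADs are first packed with saturation. A packed minimum of 0xffff is then ambiguous, since the true 32-bit minimum may simply exceed 16 bits, and only in that rare case is the horizontal minimum redone at full width below.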
- if (UNLIKELY(local_best_sad == 0xffff)) { - __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; - - v_loval_d = v_sad_d; - v_loidx_d = _mm_set_epi32(3, 2, 1, 0); - v_hival_d = _mm_srli_si128(v_loval_d, 8); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - v_hival_d = _mm_srli_si128(v_loval_d, 4); - v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); - - v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); - - v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); - v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); - - local_best_sad = _mm_extract_epi32(v_loval_d, 0); - local_best_idx = _mm_extract_epi32(v_loidx_d, 0); - } - - // Update the global minimum if the local minimum is smaller - if (LIKELY(local_best_sad < best_sad)) { -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif - new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; -#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; - - best_sad = local_best_sad; - } - } - } - - bmv = new_bmv; - best_address = new_best_address; - - v_bmv_w = _mm_set1_epi32((int)bmv.as_int); -#if VPX_ARCH_X86_64 - v_ba_q = _mm_set1_epi64x((intptr_t)best_address); -#else - v_ba_d = _mm_set1_epi32((intptr_t)best_address); -#endif - - if (UNLIKELY(best_address == in_what)) { - (*num00)++; - } - } - - *best_mv = bmv.as_mv; - return best_sad; -} diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c index bf0e8b121..94506aad0 100644 --- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c +++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c @@ -469,18 +469,18 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, // It's used to choose the src offset and filter coefficient offset. const int offset_idx1 = (offset1_q4 >> 4) & 1; const int offset_idx2 = (offset2_q4 >> 4) & 1; - static const shuffle_filter_funcs shuffle_filter_funcs[2] = { + static const shuffle_filter_funcs kShuffleFilterFuncs[2] = { shuffle_filter_ssse3, shuffle_filter_odd_ssse3 }; - static const convolve8_funcs convolve8_funcs[2] = { + static const convolve8_funcs kConvolve8Funcs[2] = { convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3 }; assert(w && h); shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0); - shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); - shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); + kShuffleFilterFuncs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1); + kShuffleFilterFuncs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2); // Sub 64 to avoid overflow. // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here. 
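The kShuffleFilterFuncs/kConvolve8Funcs tables renamed above drive the even/odd tap-offset dispatch of the 4-to-3 scaler: the source phase advances by step_q4 (1/16-pel units) per output pixel, and bit 4 of each accumulated offset, the parity of its integer-pel part, picks the even- or odd-offset kernel. A small standalone C sketch of that selection, with the offset derivation simplified (the real computation lives in scale_plane_4_to_3_general()):

#include <stdio.h>

int main(void) {
  /* 4-to-3 downscale: the source advances by about 4/3 pixels per output
   * pixel, tracked in 1/16-pel (Q4) precision as in the SSSE3 code. */
  const int step_q4 = (4 << 4) / 3; /* 21 in Q4, i.e. 1.3125 pixels */
  int phase_scaler;
  for (phase_scaler = 0; phase_scaler < 16; ++phase_scaler) {
    const int offset1_q4 = phase_scaler + 1 * step_q4;
    const int offset2_q4 = phase_scaler + 2 * step_q4;
    /* Bit 4 is the integer-pel parity; it indexes kShuffleFilterFuncs and
     * kConvolve8Funcs to pick the even- or odd-offset variant. */
    printf("phase %2d: offset_idx1=%d offset_idx2=%d\n", phase_scaler,
           (offset1_q4 >> 4) & 1, (offset2_q4 >> 4) & 1);
  }
  return 0;
}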
@@ -522,11 +522,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, // 04 14 24 34 44 54 64 74 // 05 15 25 35 45 55 65 75 d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); - d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); - d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); - d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); - d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72 // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73 @@ -598,11 +598,11 @@ static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride, loadu_8bit_16x4(t, stride_hor, &s[4]); d[0] = convolve8_8_even_offset_ssse3(&s[0], f0); - d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1); - d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2); + d[1] = kConvolve8Funcs[offset_idx1](&s[offset1_q4 >> 5], f1); + d[2] = kConvolve8Funcs[offset_idx2](&s[offset2_q4 >> 5], f2); d[3] = convolve8_8_even_offset_ssse3(&s[2], f0); - d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); - d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); + d[4] = kConvolve8Funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1); + d[5] = kConvolve8Funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2); // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17 // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37 diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index da285be8e..bf44b0867 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -16,6 +16,8 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_avx2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" // Zero fill 8 positions in the output buffer. 
static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { @@ -29,11 +31,13 @@ static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) { } static VPX_FORCE_INLINE void load_fp_values_avx2( - const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, - __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->round_fp)); *round = _mm256_permute4x64_epi64(*round, 0x54); - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_fp)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); @@ -98,13 +102,13 @@ static VPX_FORCE_INLINE void quantize_fp_16( } void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -113,8 +117,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_setzero_si256(); quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, @@ -203,14 +206,13 @@ static VPX_FORCE_INLINE void quantize_fp_32x32_16( } void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -219,8 +221,7 @@ void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + load_fp_values_avx2(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_srli_epi16(dequant, 2); quant = _mm256_slli_epi16(quant, 1); { @@ -286,16 +287,17 @@ static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) { } static VPX_FORCE_INLINE void highbd_load_fp_values( - const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr, - __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) { - *round = highbd_init_256(round_ptr); - *quant = highbd_init_256(quant_ptr); + const struct macroblock_plane *mb_plane, __m256i *round, __m256i *quant, + const int16_t *dequant_ptr, __m256i *dequant) { + *round = highbd_init_256(mb_plane->round_fp); + *quant = 
highbd_init_256(mb_plane->quant_fp); *dequant = highbd_init_256(dequant_ptr); } static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob( const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) { - const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask); + const __m256i packed_nz_mask = + _mm256_packs_epi32(nz_mask, _mm256_setzero_si256()); const __m256i packed_nz_mask_perm = _mm256_permute4x64_epi64(packed_nz_mask, 0xD8); const __m256i iscan = @@ -324,16 +326,15 @@ static VPX_FORCE_INLINE void highbd_quantize_fp( } void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i round, quant, dequant; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -342,8 +343,7 @@ void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, n_coeffs = -n_coeffs; // Setup global values - highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs, iscan + n_coeffs, qcoeff_ptr + n_coeffs, @@ -390,14 +390,14 @@ static VPX_FORCE_INLINE void highbd_quantize_fp_32x32( } void vp9_highbd_quantize_fp_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, - const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, - const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const tran_low_t *coeff_ptr, intptr_t n_coeffs, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const int step = 8; __m256i round, quant, dequant, thr; __m256i eob_max = _mm256_setzero_si256(); - (void)scan; + const int16_t *iscan = scan_order->iscan; coeff_ptr += n_coeffs; iscan += n_coeffs; @@ -406,8 +406,7 @@ void vp9_highbd_quantize_fp_32x32_avx2( n_coeffs = -n_coeffs; // Setup global values - highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, - &dequant); + highbd_load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); thr = _mm256_srli_epi32(dequant, 2); // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when // calculating the zbin mask. 
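This file, and the SSE2/SSSE3 files below, switch the quantizer entry points from loose round_ptr/quant_ptr/scan/iscan arguments to the encoder's macroblock_plane and ScanOrder structs. For orientation, here is a scalar C sketch of the "fp" quantization these kernels vectorize, written against the new argument shape; the stand-in struct names and the omitted int16 clamping mark it as an illustration, not the reference vp9_quantize_fp_c:

#include <stdint.h>
#include <stdlib.h>

typedef int32_t tran_low_t;

struct plane_sketch {      /* stand-in for the real macroblock_plane */
  const int16_t *round_fp; /* rounding offsets: slot 0 = DC, slot 1 = AC */
  const int16_t *quant_fp; /* Q1.15 quantizer steps, same layout */
};

struct scan_sketch {    /* stand-in for the real ScanOrder */
  const int16_t *iscan; /* raster index -> position in the scan order */
};

static void quantize_fp_sketch(const tran_low_t *coeff, intptr_t n_coeffs,
                               const struct plane_sketch *p,
                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
                               const int16_t *dequant, uint16_t *eob_ptr,
                               const struct scan_sketch *so) {
  intptr_t i;
  int eob = -1;
  for (i = 0; i < n_coeffs; i++) {
    const int slot = i != 0; /* DC uses slot 0, all AC coefficients slot 1 */
    const int sign = coeff[i] < 0 ? -1 : 1;
    /* Round, multiply by the Q1.15 step, keep the integer part. */
    const int abs_q =
        ((abs((int)coeff[i]) + p->round_fp[slot]) * p->quant_fp[slot]) >> 16;
    qcoeff[i] = sign * abs_q;
    dqcoeff[i] = qcoeff[i] * dequant[slot];
    /* eob is the highest scan position holding a nonzero quantized value. */
    if (abs_q && so->iscan[i] > eob) eob = so->iscan[i];
  }
  *eob_ptr = (uint16_t)(eob + 1);
}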
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index c87723443..2481eb366 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -17,12 +17,14 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; @@ -31,11 +33,10 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vp9/encoder/x86/vp9_quantize_ssse3.c b/vp9/encoder/x86/vp9_quantize_ssse3.c index d35004e37..98decae74 100644 --- a/vp9/encoder/x86/vp9_quantize_ssse3.c +++ b/vp9/encoder/x86/vp9_quantize_ssse3.c @@ -17,12 +17,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); __m128i thr; int nzflag; @@ -31,11 +33,10 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. - load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -119,12 +120,11 @@ void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *round_ptr, - const int16_t *quant_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m128i one_s16 = _mm_set1_epi16(1); __m128i thr; @@ -134,11 +134,10 @@ void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i coeff0, coeff1; __m128i qcoeff0, qcoeff1; __m128i eob; - - (void)scan; + const int16_t *iscan = scan_order->iscan; // Setup global values. 
- load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant); + load_fp_values(mb_plane, &round, &quant, dequant_ptr, &dequant); // The 32x32 halves round. round = _mm_add_epi16(round, one_s16); round = _mm_srli_epi16(round, 1); diff --git a/vp9/ratectrl_rtc.cc b/vp9/ratectrl_rtc.cc index 02e50a857..fd81bce7b 100644 --- a/vp9/ratectrl_rtc.cc +++ b/vp9/ratectrl_rtc.cc @@ -25,22 +25,16 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create( VP9RateControlRTC()); if (!rc_api) return nullptr; rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_))); - if (!rc_api->cpi_) { - rc_api.reset(); - return nullptr; - } + if (!rc_api->cpi_) return nullptr; vp9_zero(*rc_api->cpi_); - rc_api->InitRateControl(cfg); + if (!rc_api->InitRateControl(cfg)) return nullptr; if (cfg.aq_mode) { VP9_COMP *const cpi = rc_api->cpi_; cpi->segmentation_map = static_cast<uint8_t *>( vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols, sizeof(*cpi->segmentation_map))); - if (!cpi->segmentation_map) { - rc_api.reset(); - return nullptr; - } + if (!cpi->segmentation_map) return nullptr; cpi->cyclic_refresh = vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols); cpi->cyclic_refresh->content_mode = 0; @@ -48,7 +42,30 @@ std::unique_ptr<VP9RateControlRTC> VP9RateControlRTC::Create( return rc_api; } -void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { +VP9RateControlRTC::~VP9RateControlRTC() { + if (cpi_) { + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { + int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); + LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; + vpx_free(lc->map); + vpx_free(lc->last_coded_q_map); + vpx_free(lc->consec_zero_mv); + } + } + } + if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { + vpx_free(cpi_->segmentation_map); + cpi_->segmentation_map = NULL; + vp9_cyclic_refresh_free(cpi_->cyclic_refresh); + } + vpx_free(cpi_); + } +} + +bool VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; RATE_CONTROL *const rc = &cpi_->rc; @@ -65,7 +82,7 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { cm->current_video_frame = 0; rc->kf_boost = DEFAULT_KF_BOOST; - UpdateRateControl(rc_cfg); + if (!UpdateRateControl(rc_cfg)) return false; vp9_set_mb_mi(cm, cm->width, cm->height); cpi_->use_svc = (cpi_->svc.number_spatial_layers > 1 || @@ -79,10 +96,21 @@ void VP9RateControlRTC::InitRateControl(const VP9RateControlRtcConfig &rc_cfg) { vp9_rc_init(oxcf, 0, rc); rc->constrain_gf_key_freq_onepass_vbr = 0; cpi_->sf.use_nonrd_pick_mode = 1; + return true; } -void VP9RateControlRTC::UpdateRateControl( +bool VP9RateControlRTC::UpdateRateControl( const VP9RateControlRtcConfig &rc_cfg) { + // Since VPX_MAX_LAYERS (12) is less than the product of VPX_SS_MAX_LAYERS (5) + // and VPX_TS_MAX_LAYERS (5), check all three. 
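+ // For example, ss_number_layers = 3 with ts_number_layers = 5 satisfies
+ // both per-dimension bounds, but 3 * 5 = 15 exceeds VPX_MAX_LAYERS, so
+ // only the product check rejects that combination.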
+ if (rc_cfg.ss_number_layers < 1 || + rc_cfg.ss_number_layers > VPX_SS_MAX_LAYERS || + rc_cfg.ts_number_layers < 1 || + rc_cfg.ts_number_layers > VPX_TS_MAX_LAYERS || + rc_cfg.ss_number_layers * rc_cfg.ts_number_layers > VPX_MAX_LAYERS) { + return false; + } + VP9_COMMON *cm = &cpi_->common; VP9EncoderConfig *oxcf = &cpi_->oxcf; RATE_CONTROL *const rc = &cpi_->rc; @@ -102,6 +130,8 @@ void VP9RateControlRTC::UpdateRateControl( oxcf->maximum_buffer_size_ms = rc_cfg.buf_sz; oxcf->under_shoot_pct = rc_cfg.undershoot_pct; oxcf->over_shoot_pct = rc_cfg.overshoot_pct; + oxcf->drop_frames_water_mark = rc_cfg.frame_drop_thresh; + oxcf->content = rc_cfg.is_screen ? VP9E_CONTENT_SCREEN : VP9E_CONTENT_DEFAULT; oxcf->ss_number_layers = rc_cfg.ss_number_layers; oxcf->ts_number_layers = rc_cfg.ts_number_layers; oxcf->temporal_layering_mode = (VP9E_TEMPORAL_LAYERING_MODE)( @@ -112,7 +142,19 @@ void VP9RateControlRTC::UpdateRateControl( cpi_->framerate = rc_cfg.framerate; cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers; cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers; + vp9_set_mb_mi(cm, cm->width, cm->height); + + if (setjmp(cpi_->common.error.jmp)) { + cpi_->common.error.setjmp = 0; + vpx_clear_system_state(); + return false; + } + cpi_->common.error.setjmp = 1; + + for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { + oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; + } for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) { for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) { const int layer = @@ -126,21 +168,33 @@ void VP9RateControlRTC::UpdateRateControl( lrc->best_quality = vp9_quantizer_to_qindex(rc_cfg.min_quantizers[layer]); lc->scaling_factor_num = rc_cfg.scaling_factor_num[sl]; lc->scaling_factor_den = rc_cfg.scaling_factor_den[sl]; - oxcf->ts_rate_decimator[tl] = rc_cfg.ts_rate_decimator[tl]; } } vp9_set_rc_buffer_sizes(cpi_); vp9_new_framerate(cpi_, cpi_->framerate); if (cpi_->svc.number_temporal_layers > 1 || cpi_->svc.number_spatial_layers > 1) { - if (cm->current_video_frame == 0) vp9_init_layer_context(cpi_); + if (cm->current_video_frame == 0) { + vp9_init_layer_context(cpi_); + // svc->framedrop_mode is not currently exposed, so only allow for + // full superframe drop for now. + cpi_->svc.framedrop_mode = FULL_SUPERFRAME_DROP; + } vp9_update_layer_context_change_config(cpi_, (int)cpi_->oxcf.target_bandwidth); + cpi_->svc.max_consec_drop = rc_cfg.max_consec_drop; } vp9_check_reset_rc_flag(cpi_); + + cpi_->common.error.setjmp = 0; + return true; } -void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { +// Compute the QP for the frame. If the frame is dropped this function +// returns kDrop, and no QP is computed. If the frame is encoded (not dropped) +// the QP is computed and kOk is returned. +FrameDropDecision VP9RateControlRTC::ComputeQP( + const VP9FrameParamsQpRTC &frame_params) { VP9_COMMON *const cm = &cpi_->common; int width, height; cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; @@ -157,7 +211,7 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { cm->height = height; } vp9_set_mb_mi(cm, cm->width, cm->height); - cm->frame_type = frame_params.frame_type; + cm->frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type); // This is needed to ensure key frame does not get unset in rc_get_svc_params. cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0; cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 
1 : 0; @@ -192,11 +246,51 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) { vp9_restore_layer_context(cpi_); vp9_rc_get_svc_params(cpi_); } + if (cpi_->svc.spatial_layer_id == 0) vp9_zero(cpi_->svc.drop_spatial_layer); + // SVC: check for skip encoding of enhancement layer if the + // layer target bandwidth = 0. + if (vp9_svc_check_skip_enhancement_layer(cpi_)) + return FrameDropDecision::kDrop; + // Check for dropping this frame based on buffer level. + // Never drop on key frame, or if base layer is key for svc, + if (!frame_is_intra_only(cm) && + (!cpi_->use_svc || + !cpi_->svc.layer_context[cpi_->svc.temporal_layer_id].is_key_frame)) { + if (vp9_rc_drop_frame(cpi_)) { + // For FULL_SUPERFRAME_DROP mode (the only mode considered here): + // if the superframe drop is decided we need to save the layer context for + // all spatial layers, and call update_buffer_level and postencode_drop + // for all spatial layers. + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_save_layer_context(cpi_); + for (int sl = 1; sl < cpi_->svc.number_spatial_layers; sl++) { + cpi_->svc.spatial_layer_id = sl; + vp9_restore_layer_context(cpi_); + vp9_update_buffer_level_svc_preencode(cpi_); + vp9_rc_postencode_update_drop_frame(cpi_); + vp9_save_layer_context(cpi_); + } + } + return FrameDropDecision::kDrop; + } + } + // Compute the QP for the frame. int bottom_index, top_index; cpi_->common.base_qindex = vp9_rc_pick_q_and_bounds(cpi_, &bottom_index, &top_index); if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) vp9_cyclic_refresh_setup(cpi_); + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) + vp9_save_layer_context(cpi_); + + cpi_->last_frame_dropped = 0; + cpi_->svc.last_layer_dropped[cpi_->svc.spatial_layer_id] = 0; + if (cpi_->svc.spatial_layer_id == cpi_->svc.number_spatial_layers - 1) + cpi_->svc.num_encoded_top_layer++; + + return FrameDropDecision::kOk; } int VP9RateControlRTC::GetQP() const { return cpi_->common.base_qindex; } @@ -219,7 +313,31 @@ bool VP9RateControlRTC::GetSegmentationData( return true; } -void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) { +void VP9RateControlRTC::PostEncodeUpdate( + uint64_t encoded_frame_size, const VP9FrameParamsQpRTC &frame_params) { + cpi_->common.frame_type = static_cast<FRAME_TYPE>(frame_params.frame_type); + cpi_->svc.spatial_layer_id = frame_params.spatial_layer_id; + cpi_->svc.temporal_layer_id = frame_params.temporal_layer_id; + if (cpi_->svc.number_spatial_layers > 1 || + cpi_->svc.number_temporal_layers > 1) { + vp9_restore_layer_context(cpi_); + const int layer = LAYER_IDS_TO_IDX(cpi_->svc.spatial_layer_id, + cpi_->svc.temporal_layer_id, + cpi_->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi_->svc.layer_context[layer]; + cpi_->common.base_qindex = lc->frame_qp; + cpi_->common.MBs = lc->MBs; + // For spatial-svc, allow cyclic-refresh to be applied on the spatial + // layers, for the base temporal layer. 
+ if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ && + cpi_->svc.number_spatial_layers > 1 && + cpi_->svc.temporal_layer_id == 0) { + CYCLIC_REFRESH *const cr = cpi_->cyclic_refresh; + cr->qindex_delta[0] = lc->qindex_delta[0]; + cr->qindex_delta[1] = lc->qindex_delta[1]; + cr->qindex_delta[2] = lc->qindex_delta[2]; + } + } vp9_rc_postencode_update(cpi_, encoded_frame_size); if (cpi_->svc.number_spatial_layers > 1 || cpi_->svc.number_temporal_layers > 1) diff --git a/vp9/ratectrl_rtc.h b/vp9/ratectrl_rtc.h index b209e4db6..85005c547 100644 --- a/vp9/ratectrl_rtc.h +++ b/vp9/ratectrl_rtc.h @@ -14,22 +14,20 @@ #include <cstdint> #include <memory> -#include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_enums.h" -#include "vp9/common/vp9_onyxc_int.h" #include "vp9/vp9_iface_common.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" -#include "vp9/encoder/vp9_encoder.h" -#include "vp9/encoder/vp9_firstpass.h" #include "vp9/vp9_cx_iface.h" #include "vpx/internal/vpx_ratectrl_rtc.h" #include "vpx_mem/vpx_mem.h" -namespace libvpx { +struct VP9_COMP; +namespace libvpx { struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { public: VP9RateControlRtcConfig() { + ss_number_layers = 1; vp9_zero(max_quantizers); vp9_zero(min_quantizers); vp9_zero(scaling_factor_den); @@ -40,20 +38,21 @@ struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig { scaling_factor_den[0] = 1; max_quantizers[0] = max_quantizer; min_quantizers[0] = min_quantizer; + max_consec_drop = INT_MAX; } // Number of spatial layers int ss_number_layers; - // Number of temporal layers - int ts_number_layers; int max_quantizers[VPX_MAX_LAYERS]; int min_quantizers[VPX_MAX_LAYERS]; int scaling_factor_num[VPX_SS_MAX_LAYERS]; int scaling_factor_den[VPX_SS_MAX_LAYERS]; + // This is only for SVC for now. + int max_consec_drop; }; struct VP9FrameParamsQpRTC { - FRAME_TYPE frame_type; + RcFrameType frame_type; int spatial_layer_id; int temporal_layer_id; }; @@ -69,63 +68,46 @@ struct VP9SegmentationData { // the encoder. To use this interface, you need to link with libvpxrc.a. 
// // #include "vp9/ratectrl_rtc.h" -// VP9RateControlRTC rc_api; // VP9RateControlRtcConfig cfg; // VP9FrameParamsQpRTC frame_params; // // YourFunctionToInitializeConfig(cfg); -// rc_api.InitRateControl(cfg); +// std::unique_ptr<VP9RateControlRTC> rc_api = VP9RateControlRTC::Create(cfg); // // start encoding // while (frame_to_encode) { // if (config_changed) -// rc_api.UpdateRateControl(cfg); +// rc_api->UpdateRateControl(cfg); // YourFunctionToFillFrameParams(frame_params); -// rc_api.ComputeQP(frame_params); -// YourFunctionToUseQP(rc_api.GetQP()); -// YourFunctionToUseLoopfilter(rc_api.GetLoopfilterLevel()); +// rc_api->ComputeQP(frame_params); +// YourFunctionToUseQP(rc_api->GetQP()); +// YourFunctionToUseLoopfilter(rc_api->GetLoopfilterLevel()); // // After encoding -// rc_api.PostEncode(encoded_frame_size); +// rc_api->PostEncode(encoded_frame_size, frame_params); // } class VP9RateControlRTC { public: static std::unique_ptr<VP9RateControlRTC> Create( const VP9RateControlRtcConfig &cfg); - ~VP9RateControlRTC() { - if (cpi_) { - if (cpi_->svc.number_spatial_layers > 1 || - cpi_->svc.number_temporal_layers > 1) { - for (int sl = 0; sl < cpi_->svc.number_spatial_layers; sl++) { - for (int tl = 0; tl < cpi_->svc.number_temporal_layers; tl++) { - int layer = LAYER_IDS_TO_IDX(sl, tl, cpi_->oxcf.ts_number_layers); - LAYER_CONTEXT *const lc = &cpi_->svc.layer_context[layer]; - vpx_free(lc->map); - vpx_free(lc->last_coded_q_map); - vpx_free(lc->consec_zero_mv); - } - } - } - if (cpi_->oxcf.aq_mode == CYCLIC_REFRESH_AQ) { - vpx_free(cpi_->segmentation_map); - cpi_->segmentation_map = NULL; - vp9_cyclic_refresh_free(cpi_->cyclic_refresh); - } - vpx_free(cpi_); - } - } + ~VP9RateControlRTC(); - void UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); + bool UpdateRateControl(const VP9RateControlRtcConfig &rc_cfg); // GetQP() needs to be called after ComputeQP() to get the latest QP int GetQP() const; int GetLoopfilterLevel() const; bool GetSegmentationData(VP9SegmentationData *segmentation_data) const; - void ComputeQP(const VP9FrameParamsQpRTC &frame_params); + // ComputeQP computes the QP if the frame is not dropped (kOk return), + // otherwise it returns kDrop and subsequent GetQP and PostEncodeUpdate + // are not to be called (vp9_rc_postencode_update_drop_frame is already + // called via ComputeQP if drop is decided). + FrameDropDecision ComputeQP(const VP9FrameParamsQpRTC &frame_params); // Feedback to rate control with the size of current encoded frame - void PostEncodeUpdate(uint64_t encoded_frame_size); + void PostEncodeUpdate(uint64_t encoded_frame_size, + const VP9FrameParamsQpRTC &frame_params); private: VP9RateControlRTC() {} - void InitRateControl(const VP9RateControlRtcConfig &cfg); - VP9_COMP *cpi_; + bool InitRateControl(const VP9RateControlRtcConfig &cfg); + struct VP9_COMP *cpi_; }; } // namespace libvpx diff --git a/vp9/simple_encode.cc b/vp9/simple_encode.cc index f42912d35..2e6f9a451 100644 --- a/vp9/simple_encode.cc +++ b/vp9/simple_encode.cc @@ -143,7 +143,6 @@ get_frame_type_from_update_type(FRAME_UPDATE_TYPE update_type) { default: fprintf(stderr, "Unsupported update_type %d\n", update_type); abort(); - return kFrameTypeInter; } } @@ -183,10 +182,11 @@ static void update_motion_vector_info( const MV_REFERENCE_FRAME *in_ref_frame = input_motion_vector_info[i].ref_frame; output_motion_vector_info[i].mv_count = - (in_ref_frame[0] == INTRA_FRAME) ? 0 - : ((in_ref_frame[1] == NONE) ? 
1 : 2); - if (in_ref_frame[0] == NONE) { - fprintf(stderr, "in_ref_frame[0] shouldn't be NONE\n"); + (in_ref_frame[0] == INTRA_FRAME) + ? 0 + : ((in_ref_frame[1] == NO_REF_FRAME) ? 1 : 2); + if (in_ref_frame[0] == NO_REF_FRAME) { + fprintf(stderr, "in_ref_frame[0] shouldn't be NO_REF_FRAME\n"); abort(); } output_motion_vector_info[i].ref_frame[0] = diff --git a/vp9/simple_encode.h b/vp9/simple_encode.h index 7920e95ee..d610a5e15 100644 --- a/vp9/simple_encode.h +++ b/vp9/simple_encode.h @@ -309,7 +309,7 @@ struct EncodeFrameResult { // The tpl stats stored in the vector is according to the encoding order. // For example, suppose there are N show frames for the current GOP. // Then tpl_stats_info[0] stores the information of the first frame to be - // encoded for this GOP, i.e, the AltRef frame. + // encoded for this GOP, i.e., the AltRef frame. std::vector<TplStatsInfo> tpl_stats_info; ImageBuffer coded_frame; diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index dee175dc0..e738feda0 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -29,6 +29,8 @@ #include "vp9/vp9_cx_iface.h" #include "vp9/vp9_iface_common.h" +#include "vpx/vpx_tpl.h" + typedef struct vp9_extracfg { int cpu_used; // available cpu percentage in 1/16 unsigned int enable_auto_alt_ref; @@ -129,6 +131,8 @@ struct vpx_codec_alg_priv { BufferPool *buffer_pool; }; +// Called by encoder_set_config() and encoder_encode() only. Must not be called +// by encoder_init(). static vpx_codec_err_t update_error_state( vpx_codec_alg_priv_t *ctx, const struct vpx_internal_error_info *error) { const vpx_codec_err_t res = error->error_code; @@ -635,8 +639,12 @@ static vpx_codec_err_t set_encoder_config( for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { for (tl = 0; tl < oxcf->ts_number_layers; ++tl) { - oxcf->layer_target_bitrate[sl * oxcf->ts_number_layers + tl] = - 1000 * cfg->layer_target_bitrate[sl * oxcf->ts_number_layers + tl]; + const int layer = sl * oxcf->ts_number_layers + tl; + if (cfg->layer_target_bitrate[layer] > INT_MAX / 1000) + oxcf->layer_target_bitrate[layer] = INT_MAX; + else + oxcf->layer_target_bitrate[layer] = + 1000 * cfg->layer_target_bitrate[layer]; } } if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) { @@ -789,10 +797,22 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) { if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS) ERROR("Cannot change width or height after initialization"); - if (!valid_ref_frame_size(ctx->cfg.g_w, ctx->cfg.g_h, cfg->g_w, cfg->g_h) || + // Note: function encoder_set_config() is allowed to be called multiple + // times. However, when the original frame width or height is less than two + // times of the new frame width or height, a forced key frame should be + // used. To make sure the correct detection of a forced key frame, we need + // to update the frame width and height only when the actual encoding is + // performed. cpi->last_coded_width and cpi->last_coded_height are used to + // track the actual coded frame size. + if ((ctx->cpi->last_coded_width && ctx->cpi->last_coded_height && + !valid_ref_frame_size(ctx->cpi->last_coded_width, + ctx->cpi->last_coded_height, cfg->g_w, + cfg->g_h)) || (ctx->cpi->initial_width && (int)cfg->g_w > ctx->cpi->initial_width) || - (ctx->cpi->initial_height && (int)cfg->g_h > ctx->cpi->initial_height)) + (ctx->cpi->initial_height && + (int)cfg->g_h > ctx->cpi->initial_height)) { force_key = 1; + } } // Prevent increasing lag_in_frames. 
This check is stricter than it needs @@ -813,6 +833,7 @@ static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx, assert(codec_err != VPX_CODEC_OK); return codec_err; } + ctx->cpi->common.error.setjmp = 1; ctx->cfg = *cfg; set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg); @@ -1068,6 +1089,7 @@ static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx, cpi->compute_frame_low_motion_onepass = 0; cpi->rc.constrain_gf_key_freq_onepass_vbr = 0; cpi->cyclic_refresh->content_mode = 0; + cpi->disable_scene_detection_rtc_ratectrl = 1; } return VPX_CODEC_OK; } @@ -1300,6 +1322,9 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + cpi->last_coded_width = ctx->oxcf.width; + cpi->last_coded_height = ctx->oxcf.height; + if (img != NULL) { res = validate_img(ctx, img); if (res == VPX_CODEC_OK) { @@ -1631,13 +1656,9 @@ static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx, if (data) { vpx_roi_map_t *roi = (vpx_roi_map_t *)data; - - if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, - roi->delta_q, roi->delta_lf, roi->skip, - roi->ref_frame)) { - return VPX_CODEC_OK; - } - return VPX_CODEC_INVALID_PARAM; + return vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols, + roi->delta_q, roi->delta_lf, roi->skip, + roi->ref_frame); } return VPX_CODEC_INVALID_PARAM; } @@ -1675,9 +1696,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx, vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *); if (mode) { - const int res = - vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode, - (VPX_SCALING)mode->v_scaling_mode); + const int res = vp9_set_internal_size(ctx->cpi, mode->h_scaling_mode, + mode->v_scaling_mode); return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM; } return VPX_CODEC_INVALID_PARAM; @@ -1933,16 +1953,28 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx, const FRAME_INFO *frame_info = &cpi->frame_info; vpx_rc_config_t ratectrl_config; vpx_codec_err_t codec_status; + memset(&ratectrl_config, 0, sizeof(ratectrl_config)); ratectrl_config.frame_width = frame_info->frame_width; ratectrl_config.frame_height = frame_info->frame_height; ratectrl_config.show_frame_count = cpi->twopass.first_pass_info.num_frames; - + ratectrl_config.max_gf_interval = oxcf->max_gf_interval; + ratectrl_config.min_gf_interval = oxcf->min_gf_interval; // TODO(angiebird): Double check whether this is the proper way to set up // target_bitrate and frame_rate. 
ratectrl_config.target_bitrate_kbps = (int)(oxcf->target_bandwidth / 1000); ratectrl_config.frame_rate_num = oxcf->g_timebase.den; ratectrl_config.frame_rate_den = oxcf->g_timebase.num; + ratectrl_config.overshoot_percent = oxcf->over_shoot_pct; + ratectrl_config.undershoot_percent = oxcf->under_shoot_pct; + + if (oxcf->rc_mode == VPX_VBR) { + ratectrl_config.rc_mode = VPX_RC_VBR; + } else if (oxcf->rc_mode == VPX_Q) { + ratectrl_config.rc_mode = VPX_RC_QMODE; + } else if (oxcf->rc_mode == VPX_CQ) { + ratectrl_config.rc_mode = VPX_RC_CQ; + } codec_status = vp9_extrc_create(funcs, ratectrl_config, ext_ratectrl); if (codec_status != VPX_CODEC_OK) { @@ -2065,8 +2097,8 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { 0, // rc_resize_allowed 0, // rc_scaled_width 0, // rc_scaled_height - 60, // rc_resize_down_thresold - 30, // rc_resize_up_thresold + 60, // rc_resize_down_thresh + 30, // rc_resize_up_thresh VPX_VBR, // rc_end_usage { NULL, 0 }, // rc_twopass_stats_in @@ -2099,7 +2131,7 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { { 0 }, // ts_rate_decimator 0, // ts_periodicity { 0 }, // ts_layer_id - { 0 }, // layer_taget_bitrate + { 0 }, // layer_target_bitrate 0, // temporal_layering_mode 0, // use_vizier_rc_params { 1, 1 }, // active_wq_factor diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index bdfe21793..a242c776c 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -256,6 +256,7 @@ static void set_ppflags(const vpx_codec_alg_priv_t *ctx, vp9_ppflags_t *flags) { } while (0) static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { + vpx_codec_err_t res; ctx->last_show_frame = -1; ctx->need_resync = 1; ctx->flushed = 0; @@ -265,6 +266,8 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { ctx->pbi = vp9_decoder_create(ctx->buffer_pool); if (ctx->pbi == NULL) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; set_error_detail(ctx, "Failed to allocate decoder"); return VPX_CODEC_MEM_ERROR; } @@ -282,7 +285,14 @@ static vpx_codec_err_t init_decoder(vpx_codec_alg_priv_t *ctx) { if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)) set_default_ppflags(&ctx->postproc_cfg); - return init_buffer_callbacks(ctx); + res = init_buffer_callbacks(ctx); + if (res != VPX_CODEC_OK) { + vpx_free(ctx->buffer_pool); + ctx->buffer_pool = NULL; + vp9_decoder_remove(ctx->pbi); + ctx->pbi = NULL; + } + return res; } static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx, @@ -348,7 +358,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, // Initialize the decoder on the first frame. 
if (ctx->pbi == NULL) { - const vpx_codec_err_t res = init_decoder(ctx); + res = init_decoder(ctx); if (res != VPX_CODEC_OK) return res; } @@ -367,7 +377,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, for (i = 0; i < frame_count; ++i) { const uint8_t *data_start_copy = data_start; const uint32_t frame_size = frame_sizes[i]; - vpx_codec_err_t res; if (data_start < data || frame_size > (uint32_t)(data_end - data_start)) { set_error_detail(ctx, "Invalid frame size in index"); return VPX_CODEC_CORRUPT_FRAME; @@ -382,8 +391,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, const uint8_t *const data_end = data + data_sz; while (data_start < data_end) { const uint32_t frame_size = (uint32_t)(data_end - data_start); - const vpx_codec_err_t res = - decode_one(ctx, &data_start, frame_size, user_priv, deadline); + res = decode_one(ctx, &data_start, frame_size, user_priv, deadline); if (res != VPX_CODEC_OK) return res; // Account for suboptimal termination by the encoder. diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 9072628f2..44790ef6a 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -40,6 +40,7 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h VP9_CX_SRCS-yes += encoder/vp9_encodemv.h VP9_CX_SRCS-yes += encoder/vp9_extend.h VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_firstpass_stats.h VP9_CX_SRCS-yes += encoder/vp9_frame_scale.c VP9_CX_SRCS-yes += encoder/vp9_job_queue.h VP9_CX_SRCS-yes += encoder/vp9_lookahead.c @@ -104,20 +105,24 @@ VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += common/vp9_postproc.c endif VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.c VP9_CX_SRCS-yes += encoder/vp9_temporal_filter.h +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.c +VP9_CX_SRCS-yes += encoder/vp9_tpl_model.h VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c -VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/vp9_temporal_filter_constants.h +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c -VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c endif VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm @@ -134,11 +139,12 @@ endif VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_avx2.c -ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_error_neon.c -endif VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_frame_scale_neon.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_highbd_error_neon.c +endif VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c @@ -156,8 +162,10 @@ VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_firstpass.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_mbgraph.c 
VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_sse4.c -VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/temporal_filter_constants.h +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_temporal_filter_constants.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/x86/highbd_temporal_filter_sse4.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_temporal_filter_neon.c +VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/arm/neon/vp9_highbd_temporal_filter_neon.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.h VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_alt_ref_aq.c VP9_CX_SRCS_REMOVE-$(CONFIG_REALTIME_ONLY) += encoder/vp9_aq_variance.c diff --git a/vpx/exports_com b/vpx/exports_com index 2ab05099f..f0b46aa17 100644 --- a/vpx/exports_com +++ b/vpx/exports_com @@ -14,3 +14,6 @@ text vpx_img_flip text vpx_img_free text vpx_img_set_rect text vpx_img_wrap +text vpx_free_tpl_gop_stats +text vpx_read_tpl_gop_stats +text vpx_write_tpl_gop_stats diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 670fe380e..aae321873 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -48,6 +48,8 @@ #include "../vpx_encoder.h" #include <stdarg.h> +#include "vpx_config.h" + #ifdef __cplusplus extern "C" { #endif @@ -427,6 +429,27 @@ struct vpx_internal_error_info { jmp_buf jmp; }; +#if CONFIG_DEBUG +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval " at %s:%d", __FILE__, \ + __LINE__); \ + } while (0) +#else +#define CHECK_MEM_ERROR(error, lval, expr) \ + do { \ + assert((error)->setjmp); \ + (lval) = (expr); \ + if (!(lval)) \ + vpx_internal_error(error, VPX_CODEC_MEM_ERROR, \ + "Failed to allocate " #lval); \ + } while (0) +#endif + #define CLANG_ANALYZER_NORETURN #if defined(__has_feature) #if __has_feature(attribute_analyzer_noreturn) diff --git a/vpx/internal/vpx_ratectrl_rtc.h b/vpx/internal/vpx_ratectrl_rtc.h index 65398c654..01d64b14b 100644 --- a/vpx/internal/vpx_ratectrl_rtc.h +++ b/vpx/internal/vpx_ratectrl_rtc.h @@ -14,6 +14,14 @@ #include "vpx/vpx_encoder.h" namespace libvpx { + +enum class RcFrameType { kKeyFrame = 0, kInterFrame = 1 }; + +enum class FrameDropDecision { + kOk, // Frame is encoded. + kDrop, // Frame is dropped. +}; + struct VpxRateControlRtcConfig { public: VpxRateControlRtcConfig() { @@ -34,6 +42,8 @@ struct VpxRateControlRtcConfig { aq_mode = 0; layer_target_bitrate[0] = static_cast<int>(target_bandwidth); ts_rate_decimator[0] = 1; + frame_drop_thresh = 0; + is_screen = false; } int width; @@ -57,6 +67,8 @@ struct VpxRateControlRtcConfig { // vbr, cbr enum vpx_rc_mode rc_mode; int aq_mode; + int frame_drop_thresh; + bool is_screen; }; } // namespace libvpx #endif // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_ diff --git a/vpx/src/vpx_codec.c b/vpx/src/vpx_codec.c index 114b94e19..24528d860 100644 --- a/vpx/src/vpx_codec.c +++ b/vpx/src/vpx_codec.c @@ -50,12 +50,12 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err) { return "Unrecognized error code"; } -const char *vpx_codec_error(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx) { return (ctx) ? 
vpx_codec_err_to_string(ctx->err) : vpx_codec_err_to_string(VPX_CODEC_INVALID_PARAM); } -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx) { +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx) { if (ctx && ctx->err) return ctx->priv ? ctx->priv->err_detail : ctx->err_detail; @@ -82,7 +82,7 @@ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx) { } vpx_codec_caps_t vpx_codec_get_caps(vpx_codec_iface_t *iface) { - return (iface) ? iface->caps : 0; + return iface ? iface->caps : 0; } vpx_codec_err_t vpx_codec_control_(vpx_codec_ctx_t *ctx, int ctrl_id, ...) { diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index 846638fe5..0d6e48015 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -54,6 +54,10 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, res = ctx->iface->init(ctx, NULL); if (res) { + // IMPORTANT: ctx->priv->err_detail must be null or point to a string + // that remains valid after ctx->priv is destroyed, such as a C string + // literal. This makes it safe to call vpx_codec_error_detail() after + // vpx_codec_enc_init_ver() failed. ctx->err_detail = ctx->priv ? ctx->priv->err_detail : NULL; vpx_codec_destroy(ctx); } diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c new file mode 100644 index 000000000..62c2a9c85 --- /dev/null +++ b/vpx/src/vpx_tpl.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "vpx/vpx_codec.h" +#include "vpx/vpx_tpl.h" +#include "vpx_mem/vpx_mem.h" + +#define CHECK_FPRINTF_ERROR(expr) \ + do { \ + if (expr < 0) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +#define CHECK_FSCANF_ERROR(expr, expected_value) \ + do { \ + if (expr != expected_value) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats) { + int i; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size)); + + for (i = 0; i < tpl_gop_stats->size; i++) { + VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats.num_blocks; + int block; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width, + frame_stats.frame_height, num_blocks)); + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats block_stats = frame_stats.block_stats_list[block]; + CHECK_FPRINTF_ERROR( + fprintf(tpl_file, + "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64 + " %" PRId64 " %d\n", + block_stats.inter_cost, block_stats.intra_cost, + block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist, + block_stats.recrf_rate, block_stats.ref_frame_index)); + } + } + + return VPX_CODEC_OK; +} + +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats) { + int i, frame_list_size; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1); + tpl_gop_stats->size = frame_list_size; + tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc( + frame_list_size, 
sizeof(tpl_gop_stats->frame_stats_list[0])); + if (tpl_gop_stats->frame_stats_list == NULL) { + return VPX_CODEC_MEM_ERROR; + } + for (i = 0; i < frame_list_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + int num_blocks, width, height, block; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3); + frame_stats->num_blocks = num_blocks; + frame_stats->frame_width = width; + frame_stats->frame_height = height; + frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc( + num_blocks, sizeof(frame_stats->block_stats_list[0])); + if (frame_stats->block_stats_list == NULL) { + vpx_free_tpl_gop_stats(tpl_gop_stats); + return VPX_CODEC_MEM_ERROR; + } + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block]; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, + "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64 + " %" SCNd64 " %d\n", + &block_stats->inter_cost, &block_stats->intra_cost, + &block_stats->mv_c, &block_stats->mv_r, + &block_stats->recrf_dist, &block_stats->recrf_rate, + &block_stats->ref_frame_index), + 7); + } + } + + return VPX_CODEC_OK; +} + +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats) { + int frame; + if (tpl_gop_stats == NULL) return; + for (frame = 0; frame < tpl_gop_stats->size; frame++) { + vpx_free(tpl_gop_stats->frame_stats_list[frame].block_stats_list); + } + vpx_free(tpl_gop_stats->frame_stats_list); +} diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index e0b679fbb..2875e185e 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -166,6 +166,7 @@ enum vp8e_enc_control_id { * * \note Valid range for VP8: -16..16 * \note Valid range for VP9: -9..9 + * \note A negative value (-n) is treated as its absolute value (n) in VP9. * * Supported in codecs: VP8, VP9 */ @@ -302,7 +303,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. * * Supported in codecs: VP9 @@ -598,7 +599,7 @@ enum vp8e_enc_control_id { * the feature is off, i.e., no golden frame boost in CBR mode and * average bitrate target is used. * - * For example, to allow 100% more bits, i.e, 2X, in a golden frame + * For example, to allow 100% more bits, i.e., 2X, in a golden frame * than average frame, set this to 100. * * Supported in codecs: VP8 diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h index b0a931e01..0d61b0738 100644 --- a/vpx/vpx_codec.h +++ b/vpx/vpx_codec.h @@ -318,19 +318,21 @@ const char *vpx_codec_err_to_string(vpx_codec_err_t err); * \param[in] ctx Pointer to this instance's context. * */ -const char *vpx_codec_error(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error(const vpx_codec_ctx_t *ctx); /*!\brief Retrieve detailed error information for codec context * * Returns a human readable string providing detailed information about - * the last error. + * the last error. The returned string is only valid until the next + * vpx_codec_* function call (except vpx_codec_error and + * vpx_codec_error_detail) on the codec context. * * \param[in] ctx Pointer to this instance's context. * * \retval NULL * No detailed information is available. 
*/ -const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); +const char *vpx_codec_error_detail(const vpx_codec_ctx_t *ctx); /* REQUIRED FUNCTIONS * @@ -345,9 +347,11 @@ const char *vpx_codec_error_detail(vpx_codec_ctx_t *ctx); * \param[in] ctx Pointer to this instance's context * * \retval #VPX_CODEC_OK - * The codec algorithm initialized. - * \retval #VPX_CODEC_MEM_ERROR - * Memory allocation failed. + * The codec instance has been destroyed. + * \retval #VPX_CODEC_INVALID_PARAM + * ctx is a null pointer. + * \retval #VPX_CODEC_ERROR + * Codec context not initialized. */ vpx_codec_err_t vpx_codec_destroy(vpx_codec_ctx_t *ctx); diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index de86579d5..25c815ef5 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -27,6 +27,7 @@ API_DOC_SRCS-yes += vpx_encoder.h API_DOC_SRCS-yes += vpx_ext_ratectrl.h API_DOC_SRCS-yes += vpx_frame_buffer.h API_DOC_SRCS-yes += vpx_image.h +API_DOC_SRCS-yes += vpx_tpl.h API_SRCS-yes += src/vpx_decoder.c API_SRCS-yes += vpx_decoder.h @@ -36,9 +37,11 @@ API_SRCS-yes += internal/vpx_codec_internal.h API_SRCS-yes += internal/vpx_ratectrl_rtc.h API_SRCS-yes += src/vpx_codec.c API_SRCS-yes += src/vpx_image.c +API_SRCS-yes += src/vpx_tpl.c API_SRCS-yes += vpx_codec.h API_SRCS-yes += vpx_codec.mk API_SRCS-yes += vpx_frame_buffer.h API_SRCS-yes += vpx_image.h API_SRCS-yes += vpx_integer.h API_SRCS-yes += vpx_ext_ratectrl.h +API_SRCS-yes += vpx_tpl.h diff --git a/vpx/vpx_decoder.h b/vpx/vpx_decoder.h index 39e5f585f..99dd8cf69 100644 --- a/vpx/vpx_decoder.h +++ b/vpx/vpx_decoder.h @@ -127,7 +127,7 @@ typedef struct vpx_codec_dec_cfg { * \param[in] ver ABI version number. Must be set to * VPX_DECODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The decoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index efaf5ef36..c45d1a2ba 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -31,6 +31,7 @@ extern "C" { #include "./vpx_codec.h" #include "./vpx_ext_ratectrl.h" +#include "./vpx_tpl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,9 +58,9 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_ENCODER_ABI_VERSION \ - (15 + VPX_CODEC_ABI_VERSION + \ - VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_ENCODER_ABI_VERSION \ + (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \ + VPX_TPL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -858,7 +859,7 @@ typedef struct vpx_svc_parameters { /*!\brief Initialize an encoder instance * - * Initializes a encoder context using the given interface. Applications + * Initializes an encoder context using the given interface. Applications * should call the vpx_codec_enc_init convenience macro instead of this * function directly, to ensure that the ABI version number parameter * is properly initialized. @@ -867,6 +868,9 @@ typedef struct vpx_svc_parameters { * is not thread safe and should be guarded with a lock if being used * in a multithreaded context. * + * If vpx_codec_enc_init_ver() fails, it is not necessary to call + * vpx_codec_destroy() on the encoder context. + * * \param[in] ctx Pointer to this instance's context. * \param[in] iface Pointer to the algorithm interface to use. * \param[in] cfg Configuration to use, if known. May be NULL. 
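The note just added to vpx_codec_enc_init_ver() tightens the init contract: after a failed init there is nothing to destroy, and the error strings remain readable. A minimal C sketch of the resulting call pattern (the config values are placeholders, not recommendations):

#include <stdio.h>

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

static int open_vp9_encoder(vpx_codec_ctx_t *ctx, unsigned int width,
                            unsigned int height) {
  vpx_codec_iface_t *const iface = vpx_codec_vp9_cx();
  vpx_codec_enc_cfg_t cfg;
  if (vpx_codec_enc_config_default(iface, &cfg, 0) != VPX_CODEC_OK) return -1;
  cfg.g_w = width;
  cfg.g_h = height;
  if (vpx_codec_enc_init(ctx, iface, &cfg, 0) != VPX_CODEC_OK) {
    /* Per the documented contract, a failed init needs no
     * vpx_codec_destroy(), and the error strings are safe to read. */
    const char *detail = vpx_codec_error_detail(ctx);
    fprintf(stderr, "init failed: %s\n",
            detail ? detail : vpx_codec_error(ctx));
    return -1;
  }
  return 0; /* success: the caller must vpx_codec_destroy(ctx) when done */
}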
@@ -906,7 +910,7 @@ vpx_codec_err_t vpx_codec_enc_init_ver(vpx_codec_ctx_t *ctx, * \param[in] ver ABI version number. Must be set to * VPX_ENCODER_ABI_VERSION * \retval #VPX_CODEC_OK - * The decoder algorithm initialized. + * The encoder algorithm has been initialized. * \retval #VPX_CODEC_MEM_ERROR * Memory allocation failed. */ diff --git a/vpx/vpx_ext_ratectrl.h b/vpx/vpx_ext_ratectrl.h index 3c5fc8cfc..46d290dff 100644 --- a/vpx/vpx_ext_ratectrl.h +++ b/vpx/vpx_ext_ratectrl.h @@ -16,6 +16,7 @@ extern "C" { #endif #include "./vpx_integer.h" +#include "./vpx_tpl.h" /*!\brief Current ABI version number * @@ -25,7 +26,7 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures. */ -#define VPX_EXT_RATECTRL_ABI_VERSION (6) +#define VPX_EXT_RATECTRL_ABI_VERSION (7) /*!\brief The control type of the inference API. * In VPX_RC_QP mode, the external rate control model determines the @@ -47,6 +48,14 @@ typedef enum vpx_rc_type { VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT } vpx_rc_type_t; +/*!\brief The rate control mode for the external rate control model. + */ +typedef enum vpx_ext_rc_mode { + VPX_RC_QMODE = 0, + VPX_RC_VBR = 1, + VPX_RC_CQ = 2, +} vpx_ext_rc_mode_t; + /*!\brief Abstract rate control model handler * * The encoder will receive the model handler from create_model() defined in @@ -271,6 +280,10 @@ typedef struct vpx_rc_frame_stats { * number of frames whose stats are accumulated. */ double count; + /*! + * Number of new mv in a frame. + */ + double new_mv_count; } vpx_rc_frame_stats_t; /*!\brief Collection of first pass frame stats @@ -294,12 +307,21 @@ typedef struct vpx_rc_config { int frame_width; /**< frame width */ int frame_height; /**< frame height */ int show_frame_count; /**< number of visible frames in the video */ + int max_gf_interval; /**< max GOP size in number of show frames */ + int min_gf_interval; /**< min GOP size in number of show frames */ /*! * Target bitrate in kilobytes per second */ int target_bitrate_kbps; int frame_rate_num; /**< numerator of frame rate */ int frame_rate_den; /**< denominator of frame rate */ + /*! + * The following fields are only for external rate control models that support + * different rate control modes. + */ + vpx_ext_rc_mode_t rc_mode; /**< Q mode or VBR mode */ + int overshoot_percent; /**< for VBR mode only */ + int undershoot_percent; /**< for VBR mode only */ } vpx_rc_config_t; /*!\brief Information passed to the external rate control model to @@ -385,13 +407,13 @@ typedef struct vpx_rc_gop_decision { * This callback is invoked by the encoder to create an external rate control * model. 
* - * \param[in] priv Callback's private data - * \param[in] ratectrl_config Pointer to vpx_rc_config_t - * \param[out] rate_ctrl_model_pt Pointer to vpx_rc_model_t + * \param[in] priv Callback's private data + * \param[in] ratectrl_config Pointer to vpx_rc_config_t + * \param[out] rate_ctrl_model_ptr Pointer to vpx_rc_model_t */ typedef vpx_rc_status_t (*vpx_rc_create_model_cb_fn_t)( void *priv, const vpx_rc_config_t *ratectrl_config, - vpx_rc_model_t *rate_ctrl_model_pt); + vpx_rc_model_t *rate_ctrl_model_ptr); /*!\brief Send first pass stats to the external rate control model callback * prototype @@ -406,6 +428,18 @@ typedef vpx_rc_status_t (*vpx_rc_send_firstpass_stats_cb_fn_t)( vpx_rc_model_t rate_ctrl_model, const vpx_rc_firstpass_stats_t *first_pass_stats); +/*!\brief Send TPL stats for the current GOP to the external rate control model + * callback prototype + * + * This callback is invoked by the encoder to send TPL stats for the GOP to the + * external rate control model. + * + * \param[in] rate_ctrl_model rate control model + * \param[in] tpl_gop_stats TPL stats for current GOP + */ +typedef vpx_rc_status_t (*vpx_rc_send_tpl_gop_stats_cb_fn_t)( + vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats); + /*!\brief Receive encode frame decision callback prototype * * This callback is invoked by the encoder to receive encode frame decision from @@ -488,6 +522,10 @@ typedef struct vpx_rc_funcs { */ vpx_rc_send_firstpass_stats_cb_fn_t send_firstpass_stats; /*! + * Send TPL stats for current GOP to the external rate control model. + */ + vpx_rc_send_tpl_gop_stats_cb_fn_t send_tpl_gop_stats; + /*! * Get encodeframe decision from the external rate control model. */ vpx_rc_get_encodeframe_decision_cb_fn_t get_encodeframe_decision; diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h new file mode 100644 index 000000000..a250aada6 --- /dev/null +++ b/vpx/vpx_tpl.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the TPL stats descriptor and associated operations + * + */ +#ifndef VPX_VPX_VPX_TPL_H_ +#define VPX_VPX_VPX_TPL_H_ + +#include <stdio.h> + +#include "./vpx_integer.h" +#include "./vpx_codec.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. 
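Taken together, the vpx_ext_ratectrl.h changes above (the vpx_ext_rc_mode_t enum, the extra vpx_rc_config_t fields, and the send_tpl_gop_stats hook in vpx_rc_funcs) are easiest to read from the model's side. A hedged sketch follows: my_model_t and the my_* functions are hypothetical, and VPX_RC_OK / VPX_RC_ERROR are assumed to be the usual vpx_rc_status_t values; only the vpx_- and VPX_-prefixed names come from the header.

    #include <stdlib.h>
    #include "vpx/vpx_ext_ratectrl.h"

    /* Hypothetical model state. */
    typedef struct my_model {
      vpx_ext_rc_mode_t mode;
      int overshoot_percent;
      int undershoot_percent;
    } my_model_t;

    static vpx_rc_status_t my_create_model(
        void *priv, const vpx_rc_config_t *cfg,
        vpx_rc_model_t *rate_ctrl_model_ptr) {
      my_model_t *m = (my_model_t *)calloc(1, sizeof(*m));
      (void)priv;
      if (m == NULL) return VPX_RC_ERROR;
      m->mode = cfg->rc_mode;
      if (cfg->rc_mode == VPX_RC_VBR) {
        /* Per the header comments, these two fields are VBR-only. */
        m->overshoot_percent = cfg->overshoot_percent;
        m->undershoot_percent = cfg->undershoot_percent;
      }
      *rate_ctrl_model_ptr = m;
      return VPX_RC_OK;
    }

    static vpx_rc_status_t my_send_tpl_gop_stats(
        vpx_rc_model_t rate_ctrl_model, const VpxTplGopStats *tpl_gop_stats) {
      /* A real model would fold per-block intra/inter costs into its rate
       * estimates; this stub only validates the GOP size. */
      (void)rate_ctrl_model;
      return tpl_gop_stats->size > 0 ? VPX_RC_OK : VPX_RC_ERROR;
    }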
Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_TPL_ABI_VERSION (2) /**<\hideinitializer*/ + +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct VpxTplBlockStats { + int16_t row; /**< Pixel row of the top left corner */ + int16_t col; /**< Pixel col of the top left corner */ + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t recrf_rate; /**< Rate from reconstructed ref frame */ + int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ + int ref_frame_index; /**< Ref frame index in the ref frame buffer */ +} VpxTplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct VpxTplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + int num_blocks; /**< Number of blocks. Size of block_stats_list */ + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; + +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. */ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + +/*!\brief Write VpxTplGopStats to file + * + * Accepts an opened file handle and writes \p tpl_gop_stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully written. + */ +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats); + +/*!\brief Read VpxTplGopStats from file + * + * Accepts an opened file handle and reads TPL stats and stores them into + * \p tpl_gop_stats. Allocates memory for TPL stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[out] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully read from file. + */ +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats); + +/*!\brief Free the memory allocated for VpxTplGopStats + * + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + */ +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_TPL_H_ diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c index 8e57bdaa5..1b17a326b 100644 --- a/vpx_dsp/arm/avg_neon.c +++ b/vpx_dsp/arm/avg_neon.c @@ -46,96 +46,93 @@ uint32_t vpx_avg_8x8_neon(const uint8_t *a, int a_stride) { // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. 
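Returning briefly to the new vpx/vpx_tpl.h above: its three file-I/O entry points compose into a simple round trip, which is worth sketching because the allocation contract is split across them (vpx_read_tpl_gop_stats() allocates, vpx_free_tpl_gop_stats() releases). Error handling is trimmed and roundtrip_tpl() is a hypothetical helper:

    #include <stdio.h>
    #include "vpx/vpx_tpl.h"

    /* Hypothetical: write encoder-produced TPL stats to disk, read them
     * back, then release the memory vpx_read_tpl_gop_stats() allocated. */
    static vpx_codec_err_t roundtrip_tpl(const VpxTplGopStats *stats,
                                         const char *path) {
      VpxTplGopStats read_back;
      vpx_codec_err_t err;
      FILE *f = fopen(path, "w");
      if (f == NULL) return VPX_CODEC_ERROR;
      err = vpx_write_tpl_gop_stats(f, stats);
      fclose(f);
      if (err != VPX_CODEC_OK) return err;

      f = fopen(path, "r");
      if (f == NULL) return VPX_CODEC_ERROR;
      err = vpx_read_tpl_gop_stats(f, &read_back);
      fclose(f);
      if (err == VPX_CODEC_OK) vpx_free_tpl_gop_stats(&read_back);
      return err;
    }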
+// satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] int vpx_satd_neon(const tran_low_t *coeff, int length) { - const int16x4_t zero = vdup_n_s16(0); - int32x4_t accum = vdupq_n_s32(0); + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; do { - const int16x8_t src0 = load_tran_low_to_s16q(coeff); - const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8); - accum = vabal_s16(accum, vget_low_s16(src0), zero); - accum = vabal_s16(accum, vget_high_s16(src0), zero); - accum = vabal_s16(accum, vget_low_s16(src8), zero); - accum = vabal_s16(accum, vget_high_s16(src8), zero); + int16x8_t abs0, abs1; + const int16x8_t s0 = load_tran_low_to_s16q(coeff); + const int16x8_t s1 = load_tran_low_to_s16q(coeff + 8); + + abs0 = vabsq_s16(s0); + sum_s32[0] = vpadalq_s16(sum_s32[0], abs0); + abs1 = vabsq_s16(s1); + sum_s32[1] = vpadalq_s16(sum_s32[1], abs1); + length -= 16; coeff += 16; } while (length != 0); - { - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int satd = vget_lane_s32(s1, 0); - return satd; - } + return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1])); } void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); + uint8x16_t r0, r1, r2, r3; + uint16x8_t sum_lo[2], sum_hi[2]; + uint16x8_t tmp_lo[2], tmp_hi[2]; + int16x8_t avg_lo, avg_hi; - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); + const int norm_factor = (height >> 5) + 3; + const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); + assert(height >= 4 && height % 4 == 0); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); + sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); + ref += 4 * ref_stride; - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); + for (i = 4; i < height; i += 4) { + r0 = vld1q_u8(ref + 0 * ref_stride); + r1 = vld1q_u8(ref + 1 * ref_stride); + r2 = 
vld1q_u8(ref + 2 * ref_stride); + r3 = vld1q_u8(ref + 3 * ref_stride); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); + tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1)); + tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1)); + tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3)); + tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3)); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); + sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]); + sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]); + sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]); + sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]); - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); - - ref += ref_stride * 8; + ref += 4 * ref_stride; } - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); + sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]); + sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]); + + avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor); + avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor); - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); + vst1q_s16(hbuf, avg_lo); + vst1q_s16(hbuf + 8, avg_hi); } int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) { + uint16x8_t sum; int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; + assert(width >= 16 && width % 16 == 0); + + sum = vpaddlq_u8(vld1q_u8(ref)); + for (i = 16; i < width; i += 16) { + sum = vpadalq_u8(sum, vld1q_u8(ref + i)); } - return (int16_t)horizontal_add_uint16x8(vec_sum); + return (int16_t)horizontal_add_uint16x8(sum); } // ref, src = [0, 510] - max diff = 16-bits @@ -214,11 +211,16 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max); const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min); - // Split to D and start doing pairwise. +#if VPX_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint8_t *)max) = vmaxvq_u8(ab07_max); + *((uint8_t *)min) = vminvq_u8(ab07_min); +#else + // Split into 64-bit vectors and execute pairwise min/max. uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max)); uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min)); - // Enough runs of vpmax/min propogate the max/min values to every position. + // Enough runs of vpmax/min propagate the max/min values to every position. ab_max = vpmax_u8(ab_max, ab_max); ab_min = vpmin_u8(ab_min, ab_min); @@ -232,4 +234,5 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b, // Store directly to avoid costly neon->gpr transfer. vst1_lane_u8((uint8_t *)max, ab_max, 0); vst1_lane_u8((uint8_t *)min, ab_min, 0); +#endif } diff --git a/vpx_dsp/arm/fdct16x16_neon.c b/vpx_dsp/arm/fdct16x16_neon.c index a458ecaa4..fde71ff30 100644 --- a/vpx_dsp/arm/fdct16x16_neon.c +++ b/vpx_dsp/arm/fdct16x16_neon.c @@ -28,6 +28,124 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #else +// Main body of fdct16x16. 
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, + int16x8_t *out /*[16]*/) { + int16x8_t s[8]; + int16x8_t x[4]; + int16x8_t step[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + s[0] = vaddq_s16(in[0], in[7]); + s[1] = vaddq_s16(in[1], in[6]); + s[2] = vaddq_s16(in[2], in[5]); + s[3] = vaddq_s16(in[3], in[4]); + s[4] = vsubq_s16(in[3], in[4]); + s[5] = vsubq_s16(in[2], in[5]); + s[6] = vsubq_s16(in[1], in[6]); + s[7] = vsubq_s16(in[0], in[7]); + + // fdct4(step, step); + x[0] = vaddq_s16(s[0], s[3]); + x[1] = vaddq_s16(s[1], s[2]); + x[2] = vsubq_s16(s[1], s[2]); + x[3] = vsubq_s16(s[0], s[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], + &out[8]); + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); + + // Stage 3 + x[0] = vaddq_s16(s[4], s[5]); + x[1] = vsubq_s16(s[4], s[5]); + x[2] = vsubq_s16(s[7], s[6]); + x[3] = vaddq_s16(s[7], s[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. 
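+  // Concretely: what fwd_txfm.c calls step1[2] is in[10] here and step1[5]
+  // is in[13], which is why the first butterfly below reads in[13]/in[10]
+  // and the second reads in[12]/in[11].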
+ + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); + butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); + + // step 3 + s[0] = vaddq_s16(in[8], s[3]); + s[1] = vaddq_s16(in[9], s[2]); + x[0] = vsubq_s16(in[9], s[2]); + x[1] = vsubq_s16(in[8], s[3]); + x[2] = vsubq_s16(in[15], s[4]); + x[3] = vsubq_s16(in[14], s[5]); + s[6] = vaddq_s16(in[14], s[5]); + s[7] = vaddq_s16(in[15], s[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); + + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); + + // step 5 + step[0] = vaddq_s16(s[0], s[1]); + step[1] = vsubq_s16(s[0], s[1]); + step[2] = vaddq_s16(x[1], s[2]); + step[3] = vsubq_s16(x[1], s[2]); + step[4] = vsubq_s16(x[2], s[5]); + step[5] = vaddq_s16(x[2], s[5]); + step[6] = vsubq_s16(s[7], s[6]); + step[7] = vaddq_s16(s[7], s[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], + &out[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], + &out[15]); + + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) + butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], + &out[3]); + + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], + &out[11]); +} + void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; int16x8_t temp1[16]; @@ -47,8 +165,8 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose top left and top right quarters into one contiguous location to // process to the top half. - transpose_s16_8x8_new(&temp0[0], &temp2[0]); - transpose_s16_8x8_new(&temp1[0], &temp2[8]); + transpose_s16_8x8q(&temp0[0], &temp2[0]); + transpose_s16_8x8q(&temp1[0], &temp2[8]); partial_round_shift(temp2); cross_input(temp2, temp3); vpx_fdct8x16_body(temp3, temp2); @@ -62,7 +180,7 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { // Transpose bottom left and bottom right quarters into one contiguous // location to process to the bottom half. 
- transpose_s16_8x8_new(&temp0[8], &temp1[0]); + transpose_s16_8x8q(&temp0[8], &temp1[0]); transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12], &temp1[13], &temp1[14], &temp1[15]); @@ -79,6 +197,194 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { #if CONFIG_VP9_HIGHBITDEPTH +// Main body of fdct8x16 column +static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, + int32x4_t *right /* [16] */) { + int32x4_t sl[8]; + int32x4_t sr[8]; + int32x4_t xl[4]; + int32x4_t xr[4]; + int32x4_t inl[8]; + int32x4_t inr[8]; + int32x4_t stepl[8]; + int32x4_t stepr[8]; + + // stage 1 + // From fwd_txfm.c: Work on the first eight values; fdct8(input, + // even_results);" + sl[0] = vaddq_s32(left[0], left[7]); + sr[0] = vaddq_s32(right[0], right[7]); + sl[1] = vaddq_s32(left[1], left[6]); + sr[1] = vaddq_s32(right[1], right[6]); + sl[2] = vaddq_s32(left[2], left[5]); + sr[2] = vaddq_s32(right[2], right[5]); + sl[3] = vaddq_s32(left[3], left[4]); + sr[3] = vaddq_s32(right[3], right[4]); + sl[4] = vsubq_s32(left[3], left[4]); + sr[4] = vsubq_s32(right[3], right[4]); + sl[5] = vsubq_s32(left[2], left[5]); + sr[5] = vsubq_s32(right[2], right[5]); + sl[6] = vsubq_s32(left[1], left[6]); + sr[6] = vsubq_s32(right[1], right[6]); + sl[7] = vsubq_s32(left[0], left[7]); + sr[7] = vsubq_s32(right[0], right[7]); + + // Copy values 8-15 as we're storing in-place + inl[0] = left[8]; + inr[0] = right[8]; + inl[1] = left[9]; + inr[1] = right[9]; + inl[2] = left[10]; + inr[2] = right[10]; + inl[3] = left[11]; + inr[3] = right[11]; + inl[4] = left[12]; + inr[4] = right[12]; + inl[5] = left[13]; + inr[5] = right[13]; + inl[6] = left[14]; + inr[6] = right[14]; + inl[7] = left[15]; + inr[7] = right[15]; + + // fdct4(step, step); + xl[0] = vaddq_s32(sl[0], sl[3]); + xr[0] = vaddq_s32(sr[0], sr[3]); + xl[1] = vaddq_s32(sl[1], sl[2]); + xr[1] = vaddq_s32(sr[1], sr[2]); + xl[2] = vsubq_s32(sl[1], sl[2]); + xr[2] = vsubq_s32(sr[1], sr[2]); + xl[3] = vsubq_s32(sl[0], sl[3]); + xr[3] = vsubq_s32(sr[0], sr[3]); + + // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) + // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) + butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, + &left[0], &right[0], &left[8], &right[8]); + + // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); + // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, + cospi_24_64, &left[4], &right[4], + &left[12], &right[12]); + + // Stage 2 + // Re-using source s5/s6 + // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) + // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) + butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], + &sr[6], &sl[5], &sr[5]); + + // Stage 3 + xl[0] = vaddq_s32(sl[4], sl[5]); + xr[0] = vaddq_s32(sr[4], sr[5]); + xl[1] = vsubq_s32(sl[4], sl[5]); + xr[1] = vsubq_s32(sr[4], sr[5]); + xl[2] = vsubq_s32(sl[7], sl[6]); + xr[2] = vsubq_s32(sr[7], sr[6]); + xl[3] = vaddq_s32(sl[7], sl[6]); + xr[3] = vaddq_s32(sr[7], sr[6]); + + // Stage 4 + // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) + // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) + butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, + cospi_28_64, &left[2], &right[2], + &left[14], &right[14]); + // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) + // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) + 
butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, + cospi_12_64, &left[10], &right[10], + &left[6], &right[6]); + + // step 2 + // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" + // That file distinguished between "in_high" and "step1" but the only + // difference is that "in_high" is the first 8 values and "step 1" is the + // second. Here, since they are all in one array, "step1" values are += 8. + + // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) + // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) + // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) + // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) + butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, + &sl[5], &sr[5], &sl[2], &sr[2]); + butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, + &sl[4], &sr[4], &sl[3], &sr[3]); + + // step 3 + sl[0] = vaddq_s32(inl[0], sl[3]); + sr[0] = vaddq_s32(inr[0], sr[3]); + sl[1] = vaddq_s32(inl[1], sl[2]); + sr[1] = vaddq_s32(inr[1], sr[2]); + xl[0] = vsubq_s32(inl[1], sl[2]); + xr[0] = vsubq_s32(inr[1], sr[2]); + xl[1] = vsubq_s32(inl[0], sl[3]); + xr[1] = vsubq_s32(inr[0], sr[3]); + xl[2] = vsubq_s32(inl[7], sl[4]); + xr[2] = vsubq_s32(inr[7], sr[4]); + xl[3] = vsubq_s32(inl[6], sl[5]); + xr[3] = vsubq_s32(inr[6], sr[5]); + sl[6] = vaddq_s32(inl[6], sl[5]); + sr[6] = vaddq_s32(inr[6], sr[5]); + sl[7] = vaddq_s32(inl[7], sl[4]); + sr[7] = vaddq_s32(inr[7], sr[4]); + + // step 4 + // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * + // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] + // * cospi_8_64) + butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, + cospi_24_64, &sl[6], &sr[6], &sl[1], + &sr[1]); + // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) + // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * + // cospi_24_64) + butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, + cospi_8_64, &sl[2], &sr[2], &sl[5], + &sr[5]); + + // step 5 + stepl[0] = vaddq_s32(sl[0], sl[1]); + stepr[0] = vaddq_s32(sr[0], sr[1]); + stepl[1] = vsubq_s32(sl[0], sl[1]); + stepr[1] = vsubq_s32(sr[0], sr[1]); + stepl[2] = vaddq_s32(xl[1], sl[2]); + stepr[2] = vaddq_s32(xr[1], sr[2]); + stepl[3] = vsubq_s32(xl[1], sl[2]); + stepr[3] = vsubq_s32(xr[1], sr[2]); + stepl[4] = vsubq_s32(xl[2], sl[5]); + stepr[4] = vsubq_s32(xr[2], sr[5]); + stepl[5] = vaddq_s32(xl[2], sl[5]); + stepr[5] = vaddq_s32(xr[2], sr[5]); + stepl[6] = vsubq_s32(sl[7], sl[6]); + stepr[6] = vsubq_s32(sr[7], sr[6]); + stepl[7] = vaddq_s32(sl[7], sl[6]); + stepr[7] = vaddq_s32(sr[7], sr[6]); + + // step 6 + // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) + // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) + butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], + cospi_18_64, cospi_14_64, &left[9], + &right[9], &left[7], &right[7]); + // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) + // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) + butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], + cospi_2_64, cospi_30_64, &left[1], + &right[1], &left[15], &right[15]); + // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) + // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * 
cospi_26_64) + butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], + cospi_26_64, cospi_6_64, &left[13], + &right[13], &left[3], &right[3]); + // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) + // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) + butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], + cospi_10_64, cospi_22_64, &left[5], + &right[5], &left[11], &right[11]); +} + void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) { int16x8_t temp0[16]; diff --git a/vpx_dsp/arm/fdct16x16_neon.h b/vpx_dsp/arm/fdct16x16_neon.h index 43d820b6b..cd58675ca 100644 --- a/vpx_dsp/arm/fdct16x16_neon.h +++ b/vpx_dsp/arm/fdct16x16_neon.h @@ -159,124 +159,6 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) { a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2); } -// Main body of fdct16x16. -static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/, - int16x8_t *out /*[16]*/) { - int16x8_t s[8]; - int16x8_t x[4]; - int16x8_t step[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - s[0] = vaddq_s16(in[0], in[7]); - s[1] = vaddq_s16(in[1], in[6]); - s[2] = vaddq_s16(in[2], in[5]); - s[3] = vaddq_s16(in[3], in[4]); - s[4] = vsubq_s16(in[3], in[4]); - s[5] = vsubq_s16(in[2], in[5]); - s[6] = vsubq_s16(in[1], in[6]); - s[7] = vsubq_s16(in[0], in[7]); - - // fdct4(step, step); - x[0] = vaddq_s16(s[0], s[3]); - x[1] = vaddq_s16(s[1], s[2]); - x[2] = vsubq_s16(s[1], s[2]); - x[3] = vsubq_s16(s[0], s[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0], - &out[8]); - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]); - - // Stage 3 - x[0] = vaddq_s16(s[4], s[5]); - x[1] = vsubq_s16(s[4], s[5]); - x[2] = vsubq_s16(s[7], s[6]); - x[3] = vaddq_s16(s[7], s[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) - butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]); - // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) - butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. 
- - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]); - butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]); - - // step 3 - s[0] = vaddq_s16(in[8], s[3]); - s[1] = vaddq_s16(in[9], s[2]); - x[0] = vsubq_s16(in[9], s[2]); - x[1] = vsubq_s16(in[8], s[3]); - x[2] = vsubq_s16(in[15], s[4]); - x[3] = vsubq_s16(in[14], s[5]); - s[6] = vaddq_s16(in[14], s[5]); - s[7] = vaddq_s16(in[15], s[4]); - - // step 4 - // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * - // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] - // * cospi_8_64) - butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]); - - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * - // cospi_24_64) - butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]); - - // step 5 - step[0] = vaddq_s16(s[0], s[1]); - step[1] = vsubq_s16(s[0], s[1]); - step[2] = vaddq_s16(x[1], s[2]); - step[3] = vsubq_s16(x[1], s[2]); - step[4] = vsubq_s16(x[2], s[5]); - step[5] = vaddq_s16(x[2], s[5]); - step[6] = vsubq_s16(s[7], s[6]); - step[7] = vaddq_s16(s[7], s[6]); - - // step 6 - // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) - // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) - butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9], - &out[7]); - // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) - // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) - butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1], - &out[15]); - - // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) - // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) - butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13], - &out[3]); - - // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) - // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) - butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5], - &out[11]); -} - #if CONFIG_VP9_HIGHBITDEPTH static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/, @@ -431,194 +313,6 @@ static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[32]*/) { vst1q_s32(a, b[15]); } -// Main body of fdct8x16 column -static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/, - int32x4_t *right /* [16] */) { - int32x4_t sl[8]; - int32x4_t sr[8]; - int32x4_t xl[4]; - int32x4_t xr[4]; - int32x4_t inl[8]; - int32x4_t inr[8]; - int32x4_t stepl[8]; - int32x4_t stepr[8]; - - // stage 1 - // From fwd_txfm.c: Work on the first eight values; fdct8(input, - // even_results);" - sl[0] = vaddq_s32(left[0], left[7]); - sr[0] = vaddq_s32(right[0], right[7]); - sl[1] = vaddq_s32(left[1], left[6]); - sr[1] = vaddq_s32(right[1], right[6]); - sl[2] = vaddq_s32(left[2], left[5]); - sr[2] = vaddq_s32(right[2], right[5]); - sl[3] = vaddq_s32(left[3], left[4]); - sr[3] = vaddq_s32(right[3], right[4]); - sl[4] = vsubq_s32(left[3], left[4]); - sr[4] = vsubq_s32(right[3], right[4]); - sl[5] = 
vsubq_s32(left[2], left[5]); - sr[5] = vsubq_s32(right[2], right[5]); - sl[6] = vsubq_s32(left[1], left[6]); - sr[6] = vsubq_s32(right[1], right[6]); - sl[7] = vsubq_s32(left[0], left[7]); - sr[7] = vsubq_s32(right[0], right[7]); - - // Copy values 8-15 as we're storing in-place - inl[0] = left[8]; - inr[0] = right[8]; - inl[1] = left[9]; - inr[1] = right[9]; - inl[2] = left[10]; - inr[2] = right[10]; - inl[3] = left[11]; - inr[3] = right[11]; - inl[4] = left[12]; - inr[4] = right[12]; - inl[5] = left[13]; - inr[5] = right[13]; - inl[6] = left[14]; - inr[6] = right[14]; - inl[7] = left[15]; - inr[7] = right[15]; - - // fdct4(step, step); - xl[0] = vaddq_s32(sl[0], sl[3]); - xr[0] = vaddq_s32(sr[0], sr[3]); - xl[1] = vaddq_s32(sl[1], sl[2]); - xr[1] = vaddq_s32(sr[1], sr[2]); - xl[2] = vsubq_s32(sl[1], sl[2]); - xr[2] = vsubq_s32(sr[1], sr[2]); - xl[3] = vsubq_s32(sl[0], sl[3]); - xr[3] = vsubq_s32(sr[0], sr[3]); - - // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64) - // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64) - butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64, - &left[0], &right[0], &left[8], &right[8]); - - // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64); - // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64); - butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64, - cospi_24_64, &left[4], &right[4], - &left[12], &right[12]); - - // Stage 2 - // Re-using source s5/s6 - // s5 = fdct_round_shift((s6 - s5) * cospi_16_64) - // s6 = fdct_round_shift((s6 + s5) * cospi_16_64) - butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6], - &sr[6], &sl[5], &sr[5]); - - // Stage 3 - xl[0] = vaddq_s32(sl[4], sl[5]); - xr[0] = vaddq_s32(sr[4], sr[5]); - xl[1] = vsubq_s32(sl[4], sl[5]); - xr[1] = vsubq_s32(sr[4], sr[5]); - xl[2] = vsubq_s32(sl[7], sl[6]); - xr[2] = vsubq_s32(sr[7], sr[6]); - xl[3] = vaddq_s32(sl[7], sl[6]); - xr[3] = vaddq_s32(sr[7], sr[6]); - - // Stage 4 - // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64) - // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64) - butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64, - cospi_28_64, &left[2], &right[2], - &left[14], &right[14]); - // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64) - // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64) - butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64, - cospi_12_64, &left[10], &right[10], - &left[6], &right[6]); - - // step 2 - // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results" - // That file distinguished between "in_high" and "step1" but the only - // difference is that "in_high" is the first 8 values and "step 1" is the - // second. Here, since they are all in one array, "step1" values are += 8. 
- - // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64) - // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64) - // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64) - // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64) - butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64, - &sl[5], &sr[5], &sl[2], &sr[2]); - butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64, - &sl[4], &sr[4], &sl[3], &sr[3]); - - // step 3 - sl[0] = vaddq_s32(inl[0], sl[3]); - sr[0] = vaddq_s32(inr[0], sr[3]); - sl[1] = vaddq_s32(inl[1], sl[2]); - sr[1] = vaddq_s32(inr[1], sr[2]); - xl[0] = vsubq_s32(inl[1], sl[2]); - xr[0] = vsubq_s32(inr[1], sr[2]); - xl[1] = vsubq_s32(inl[0], sl[3]); - xr[1] = vsubq_s32(inr[0], sr[3]); - xl[2] = vsubq_s32(inl[7], sl[4]); - xr[2] = vsubq_s32(inr[7], sr[4]); - xl[3] = vsubq_s32(inl[6], sl[5]); - xr[3] = vsubq_s32(inr[6], sr[5]); - sl[6] = vaddq_s32(inl[6], sl[5]); - sr[6] = vaddq_s32(inr[6], sr[5]); - sl[7] = vaddq_s32(inl[7], sl[4]); - sr[7] = vaddq_s32(inr[7], sr[4]); - - // step 4 - // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * - // cospi_24_64) step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] - // * cospi_8_64) - butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64, - cospi_24_64, &sl[6], &sr[6], &sl[1], - &sr[1]); - // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64) - // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * - // cospi_24_64) - butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64, - cospi_8_64, &sl[2], &sr[2], &sl[5], - &sr[5]); - - // step 5 - stepl[0] = vaddq_s32(sl[0], sl[1]); - stepr[0] = vaddq_s32(sr[0], sr[1]); - stepl[1] = vsubq_s32(sl[0], sl[1]); - stepr[1] = vsubq_s32(sr[0], sr[1]); - stepl[2] = vaddq_s32(xl[1], sl[2]); - stepr[2] = vaddq_s32(xr[1], sr[2]); - stepl[3] = vsubq_s32(xl[1], sl[2]); - stepr[3] = vsubq_s32(xr[1], sr[2]); - stepl[4] = vsubq_s32(xl[2], sl[5]); - stepr[4] = vsubq_s32(xr[2], sr[5]); - stepl[5] = vaddq_s32(xl[2], sl[5]); - stepr[5] = vaddq_s32(xr[2], sr[5]); - stepl[6] = vsubq_s32(sl[7], sl[6]); - stepr[6] = vsubq_s32(sr[7], sr[6]); - stepl[7] = vaddq_s32(sl[7], sl[6]); - stepr[7] = vaddq_s32(sr[7], sr[6]); - - // step 6 - // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64) - // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64) - butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1], - cospi_18_64, cospi_14_64, &left[9], - &right[9], &left[7], &right[7]); - // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64) - // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64) - butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0], - cospi_2_64, cospi_30_64, &left[1], - &right[1], &left[15], &right[15]); - // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64) - // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64) - butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3], - cospi_26_64, cospi_6_64, &left[13], - &right[13], &left[3], &right[3]); - // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64) - // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64) - butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2], - cospi_10_64, cospi_22_64, &left[5], - &right[5], 
&left[11], &right[11]); -} - #endif // CONFIG_VP9_HIGHBITDEPTH #endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_ diff --git a/vpx_dsp/arm/fdct32x32_neon.c b/vpx_dsp/arm/fdct32x32_neon.c index d6818d2ec..a91730ce8 100644 --- a/vpx_dsp/arm/fdct32x32_neon.c +++ b/vpx_dsp/arm/fdct32x32_neon.c @@ -60,10 +60,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -78,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output, temp5); // Second row of 8x32. - transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -96,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -114,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) { store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass(temp0, temp5); @@ -159,10 +159,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, dct_body_first_pass(temp5, temp4); // Generate the top row by munging the first set of 8 from each one together. - transpose_s16_8x8_new(&temp1[0], &temp0[0]); - transpose_s16_8x8_new(&temp2[0], &temp0[8]); - transpose_s16_8x8_new(&temp3[0], &temp0[16]); - transpose_s16_8x8_new(&temp4[0], &temp0[24]); + transpose_s16_8x8q(&temp1[0], &temp0[0]); + transpose_s16_8x8q(&temp2[0], &temp0[8]); + transpose_s16_8x8q(&temp3[0], &temp0[16]); + transpose_s16_8x8q(&temp4[0], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -177,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output, temp5); // Second row of 8x32. 
- transpose_s16_8x8_new(&temp1[8], &temp0[0]); - transpose_s16_8x8_new(&temp2[8], &temp0[8]); - transpose_s16_8x8_new(&temp3[8], &temp0[16]); - transpose_s16_8x8_new(&temp4[8], &temp0[24]); + transpose_s16_8x8q(&temp1[8], &temp0[0]); + transpose_s16_8x8q(&temp2[8], &temp0[8]); + transpose_s16_8x8q(&temp3[8], &temp0[16]); + transpose_s16_8x8q(&temp4[8], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -195,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 8 * 32, temp5); // Third row of 8x32 - transpose_s16_8x8_new(&temp1[16], &temp0[0]); - transpose_s16_8x8_new(&temp2[16], &temp0[8]); - transpose_s16_8x8_new(&temp3[16], &temp0[16]); - transpose_s16_8x8_new(&temp4[16], &temp0[24]); + transpose_s16_8x8q(&temp1[16], &temp0[0]); + transpose_s16_8x8q(&temp2[16], &temp0[8]); + transpose_s16_8x8q(&temp3[16], &temp0[16]); + transpose_s16_8x8q(&temp4[16], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); @@ -213,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, store(output + 16 * 32, temp5); // Final row of 8x32. - transpose_s16_8x8_new(&temp1[24], &temp0[0]); - transpose_s16_8x8_new(&temp2[24], &temp0[8]); - transpose_s16_8x8_new(&temp3[24], &temp0[16]); - transpose_s16_8x8_new(&temp4[24], &temp0[24]); + transpose_s16_8x8q(&temp1[24], &temp0[0]); + transpose_s16_8x8q(&temp2[24], &temp0[8]); + transpose_s16_8x8q(&temp3[24], &temp0[16]); + transpose_s16_8x8q(&temp4[24], &temp0[24]); dct_body_second_pass_rd(temp0, temp5); diff --git a/vpx_dsp/arm/fdct4x4_neon.c b/vpx_dsp/arm/fdct4x4_neon.c index 3b9196fae..4bc968ecb 100644 --- a/vpx_dsp/arm/fdct4x4_neon.c +++ b/vpx_dsp/arm/fdct4x4_neon.c @@ -52,7 +52,6 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { - static const int32x4_t const_1000 = { 1, 0, 0, 0 }; const int32x4_t const_one = vdupq_n_s32(1); // input[M * stride] * 16 @@ -64,7 +63,8 @@ void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, // If the very first value != 0, then add 1. 
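  // Portability note (presumed motivation, as the change below is otherwise
  // behavior-neutral): brace initialization of NEON vector types such as
  // int32x4_t is a GCC/Clang extension that MSVC rejects, so the constant is
  // rebuilt from a plain int32_t array via vld1q_s32() instead.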
if (input[0] != 0) { - in[0] = vaddq_s32(in[0], const_1000); + static const int32_t k1000[4] = { 1, 0, 0, 0 }; + in[0] = vaddq_s32(in[0], vld1q_s32(k1000)); } vpx_highbd_fdct4x4_pass1_neon(in); diff --git a/vpx_dsp/arm/fdct8x8_neon.h b/vpx_dsp/arm/fdct8x8_neon.h index d8fa60044..cc6515743 100644 --- a/vpx_dsp/arm/fdct8x8_neon.h +++ b/vpx_dsp/arm/fdct8x8_neon.h @@ -293,88 +293,14 @@ static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left, static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left, int32x4_t *right) { - int32x4x2_t out[8]; vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; + transpose_s32_8x8_2(left, right, left, right); } static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left, int32x4_t *right) { - int32x4x2_t out[8]; vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right); - - out[0].val[0] = left[0]; - out[0].val[1] = right[0]; - out[1].val[0] = left[1]; - out[1].val[1] = right[1]; - out[2].val[0] = left[2]; - out[2].val[1] = right[2]; - out[3].val[0] = left[3]; - out[3].val[1] = right[3]; - out[4].val[0] = left[4]; - out[4].val[1] = right[4]; - out[5].val[0] = left[5]; - out[5].val[1] = right[5]; - out[6].val[0] = left[6]; - out[6].val[1] = right[6]; - out[7].val[0] = left[7]; - out[7].val[1] = right[7]; - - transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5], - &out[6], &out[7]); - - left[0] = out[0].val[0]; - right[0] = out[0].val[1]; - left[1] = out[1].val[0]; - right[1] = out[1].val[1]; - left[2] = out[2].val[0]; - right[2] = out[2].val[1]; - left[3] = out[3].val[0]; - right[3] = out[3].val[1]; - left[4] = out[4].val[0]; - right[4] = out[4].val[1]; - left[5] = out[5].val[0]; - right[5] = out[5].val[1]; - left[6] = out[6].val[0]; - right[6] = out[6].val[1]; - left[7] = out[7].val[0]; - right[7] = out[7].val[1]; + transpose_s32_8x8_2(left, right, left, right); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/arm/fdct_neon.h b/vpx_dsp/arm/fdct_neon.h index 193594e3d..16f5c5fc0 100644 --- a/vpx_dsp/arm/fdct_neon.h +++ b/vpx_dsp/arm/fdct_neon.h @@ -177,6 +177,45 @@ static INLINE void butterfly_one_coeff_s32_fast( *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c); } +// fdct_round_shift((a +/- b) * c) +// Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 32-bit values +// returns narrowed results +static INLINE void butterfly_one_coeff_s32_s64_narrow( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant, 
int32x4_t *add_lo, + int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) { + // ac holds the following values: + // ac: vget_low_s32(a_lo) * c, vget_high_s32(a_lo) * c, + // vget_low_s32(a_hi) * c, vget_high_s32(a_hi) * c + int64x2_t ac[4]; + int64x2_t sum[4]; + int64x2_t diff[4]; + + ac[0] = vmull_n_s32(vget_low_s32(a_lo), constant); + ac[1] = vmull_n_s32(vget_high_s32(a_lo), constant); + ac[2] = vmull_n_s32(vget_low_s32(a_hi), constant); + ac[3] = vmull_n_s32(vget_high_s32(a_hi), constant); + + sum[0] = vmlal_n_s32(ac[0], vget_low_s32(b_lo), constant); + sum[1] = vmlal_n_s32(ac[1], vget_high_s32(b_lo), constant); + sum[2] = vmlal_n_s32(ac[2], vget_low_s32(b_hi), constant); + sum[3] = vmlal_n_s32(ac[3], vget_high_s32(b_hi), constant); + *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS), + vrshrn_n_s64(sum[1], DCT_CONST_BITS)); + *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS), + vrshrn_n_s64(sum[3], DCT_CONST_BITS)); + + diff[0] = vmlsl_n_s32(ac[0], vget_low_s32(b_lo), constant); + diff[1] = vmlsl_n_s32(ac[1], vget_high_s32(b_lo), constant); + diff[2] = vmlsl_n_s32(ac[2], vget_low_s32(b_hi), constant); + diff[3] = vmlsl_n_s32(ac[3], vget_high_s32(b_hi), constant); + *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS), + vrshrn_n_s64(diff[1], DCT_CONST_BITS)); + *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS), + vrshrn_n_s64(diff[3], DCT_CONST_BITS)); +} + // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal implementation on half vector // more accurate does 64-bit processing, takes and returns 32-bit values @@ -207,6 +246,44 @@ static INLINE void butterfly_two_coeff_s32_s64_narrow_half( // fdct_round_shift(a * c1 +/- b * c2) // Variant that performs normal implementation on full vector +// more accurate does 64-bit processing, takes and returns 64-bit values +// returns results without rounding +static INLINE void butterfly_two_coeff_s32_s64_noround( + const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo, + const int32x4_t b_hi, const tran_coef_t constant1, + const tran_coef_t constant2, int64x2_t *add_lo /*[2]*/, + int64x2_t *add_hi /*[2]*/, int64x2_t *sub_lo /*[2]*/, + int64x2_t *sub_hi /*[2]*/) { + // ac1/ac2 hold the following values: + // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1, + // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1 + // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2, + // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2 + int64x2_t ac1[4]; + int64x2_t ac2[4]; + + ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1); + ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1); + ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1); + ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1); + ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2); + ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2); + ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2); + ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2); + + add_lo[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2); + add_lo[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2); + add_hi[0] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2); + add_hi[1] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2); + + sub_lo[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1); + sub_lo[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1); + sub_hi[0] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1); + sub_hi[1] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1); +} + +// fdct_round_shift(a * c1 +/- b * 
c2) +// Variant that performs normal implementation on full vector // more accurate does 64-bit processing, takes and returns 32-bit values // returns narrowed results static INLINE void butterfly_two_coeff_s32_s64_narrow( @@ -420,4 +497,46 @@ static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) { return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2); } +static INLINE int32x4_t add_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vaddq_s64(a[0], b[0]); + result[1] = vaddq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t sub_s64_round_narrow(const int64x2_t *a /*[2]*/, + const int64x2_t *b /*[2]*/) { + int64x2_t result[2]; + result[0] = vsubq_s64(a[0], b[0]); + result[1] = vsubq_s64(a[1], b[1]); + return vcombine_s32(vrshrn_n_s64(result[0], DCT_CONST_BITS), + vrshrn_n_s64(result[1], DCT_CONST_BITS)); +} + +static INLINE int32x4_t add_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vaddq_s64(a64[0], b64[0]); + result[1] = vaddq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + +static INLINE int32x4_t sub_s32_s64_narrow(const int32x4_t a, + const int32x4_t b) { + int64x2_t a64[2], b64[2], result[2]; + a64[0] = vmovl_s32(vget_low_s32(a)); + a64[1] = vmovl_s32(vget_high_s32(a)); + b64[0] = vmovl_s32(vget_low_s32(b)); + b64[1] = vmovl_s32(vget_high_s32(b)); + result[0] = vsubq_s64(a64[0], b64[0]); + result[1] = vsubq_s64(a64[1], b64[1]); + return vcombine_s32(vmovn_s64(result[0]), vmovn_s64(result[1])); +} + #endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_ diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 718dba0d9..df0da543c 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -37,6 +37,15 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[1] = 0; } +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// will fail with an internal compiler error. +// See: +// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. 
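+// Scope note: MSVC applies #pragma optimize("", off) to every function
+// defined after it until the matching #pragma optimize("", on) restores the
+// command-line settings, so the workaround disables optimization for
+// vpx_fdct8x8_1_neon() only.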
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { int r; int16x8_t sum = vld1q_s16(&input[0]); @@ -49,6 +58,9 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[0] = (tran_low_t)horizontal_add_int16x8(sum); output[1] = 0; } +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride) { diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index f6b6d7e3c..f5a044be4 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -138,15 +138,15 @@ void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512); const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768); - const int16x8_t b0 = vhaddq_s16(a0, a1); - const int16x8_t b1 = vhsubq_s16(a0, a1); - const int16x8_t b2 = vhaddq_s16(a2, a3); - const int16x8_t b3 = vhsubq_s16(a2, a3); + const int16x8_t b0 = vshrq_n_s16(vhaddq_s16(a0, a1), 1); + const int16x8_t b1 = vshrq_n_s16(vhsubq_s16(a0, a1), 1); + const int16x8_t b2 = vshrq_n_s16(vhaddq_s16(a2, a3), 1); + const int16x8_t b3 = vshrq_n_s16(vhsubq_s16(a2, a3), 1); - const int16x8_t c0 = vhaddq_s16(b0, b2); - const int16x8_t c1 = vhaddq_s16(b1, b3); - const int16x8_t c2 = vhsubq_s16(b0, b2); - const int16x8_t c3 = vhsubq_s16(b1, b3); + const int16x8_t c0 = vaddq_s16(b0, b2); + const int16x8_t c1 = vaddq_s16(b1, b3); + const int16x8_t c2 = vsubq_s16(b0, b2); + const int16x8_t c3 = vsubq_s16(b1, b3); store_s16q_to_tran_low(coeff + 0, c0); store_s16q_to_tran_low(coeff + 256, c1); diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c new file mode 100644 index 000000000..4265596c8 --- /dev/null +++ b/vpx_dsp/arm/highbd_avg_neon.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +uint32_t vpx_highbd_avg_4x4_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16x8_t a0 = load_unaligned_u16q(a_ptr + 0 * p, p); + const uint16x8_t a1 = load_unaligned_u16q(a_ptr + 2 * p, p); + return (horizontal_add_uint16x8(vaddq_u16(a0, a1)) + (1 << 3)) >> 4; +} + +uint32_t vpx_highbd_avg_8x8_neon(const uint8_t *s8, int p) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7; + + load_u16_8x8(a_ptr, p, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + sum = vaddq_u16(a0, a1); + sum = vaddq_u16(sum, a2); + sum = vaddq_u16(sum, a3); + sum = vaddq_u16(sum, a4); + sum = vaddq_u16(sum, a5); + sum = vaddq_u16(sum, a6); + sum = vaddq_u16(sum, a7); + + return (horizontal_add_uint16x8(sum) + (1 << 5)) >> 6; +} + +// coeff: 32 bits, dynamic range [-2147483648, 2147483647]. +// length: value range {16, 64, 256, 1024}. 
+// satd: 42 bits, dynamic range [-2147483648 * 1024, 2147483647 * 1024] +int vpx_highbd_satd_neon(const tran_low_t *coeff, int length) { + int64x2_t sum_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int32x4_t abs0, abs1; + const int32x4_t s0 = load_tran_low_to_s32q(coeff); + const int32x4_t s1 = load_tran_low_to_s32q(coeff + 4); + + abs0 = vabsq_s32(s0); + sum_s64[0] = vpadalq_s32(sum_s64[0], abs0); + abs1 = vabsq_s32(s1); + sum_s64[1] = vpadalq_s32(sum_s64[1], abs1); + + length -= 8; + coeff += 8; + } while (length != 0); + + return (int)horizontal_add_int64x2(vaddq_s64(sum_s64[0], sum_s64[1])); +} + +void vpx_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8, + int dp, int *min, int *max) { + const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8); + const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8); + + const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p); + const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p); + const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p); + const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p); + const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p); + const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p); + const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p); + const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p); + + const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp); + const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp); + const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp); + const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp); + const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp); + const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp); + const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp); + const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp); + + const uint16x8_t abs_diff0 = vabdq_u16(a0, b0); + const uint16x8_t abs_diff1 = vabdq_u16(a1, b1); + const uint16x8_t abs_diff2 = vabdq_u16(a2, b2); + const uint16x8_t abs_diff3 = vabdq_u16(a3, b3); + const uint16x8_t abs_diff4 = vabdq_u16(a4, b4); + const uint16x8_t abs_diff5 = vabdq_u16(a5, b5); + const uint16x8_t abs_diff6 = vabdq_u16(a6, b6); + const uint16x8_t abs_diff7 = vabdq_u16(a7, b7); + + const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1); + const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3); + const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5); + const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7); + + const uint16x8_t max0123 = vmaxq_u16(max01, max23); + const uint16x8_t max4567 = vmaxq_u16(max45, max67); + const uint16x8_t max07 = vmaxq_u16(max0123, max4567); + + const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1); + const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3); + const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5); + const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7); + + const uint16x8_t min0123 = vminq_u16(min01, min23); + const uint16x8_t min4567 = vminq_u16(min45, min67); + const uint16x8_t min07 = vminq_u16(min0123, min4567); + +#if VPX_ARCH_AARCH64 + *min = *max = 0; // Clear high bits + *((uint16_t *)max) = vmaxvq_u16(max07); + *((uint16_t *)min) = vminvq_u16(min07); +#else + // Split into 64-bit vectors and execute pairwise min/max. + uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07)); + uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07)); + + // Enough runs of vpmax/min propagate the max/min values to every position. 
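A scalar model of the pairwise propagation used in the non-AArch64 branch below may help: each vpmax/vpmin pass halves the number of distinct candidates, so log2(n) passes leave every lane holding the overall result for an n-lane vector. The helper here is hypothetical and only illustrates the idea:

#include <stdint.h>

/* Scalar model of repeated vpmax: each pass reduces adjacent pairs, so
 * after log2(n) passes all lanes agree on the overall maximum.
 * n must be a power of two no larger than 8. */
static uint16_t pairwise_max_u16(const uint16_t *lanes, int n) {
  uint16_t tmp[8];
  int i;
  for (i = 0; i < n; ++i) tmp[i] = lanes[i];
  while (n > 1) {
    for (i = 0; i < n / 2; ++i) {
      const uint16_t a = tmp[2 * i];
      const uint16_t b = tmp[2 * i + 1];
      tmp[i] = a > b ? a : b;
    }
    n /= 2;
  }
  return tmp[0];
}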
+ ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + ab_max = vpmax_u16(ab_max, ab_max); + ab_min = vpmin_u16(ab_min, ab_min); + + *min = *max = 0; // Clear high bits + // Store directly to avoid costly neon->gpr transfer. + vst1_lane_u16((uint16_t *)max, ab_max, 0); + vst1_lane_u16((uint16_t *)min, ab_min, 0); +#endif +} diff --git a/vpx_dsp/arm/highbd_avg_pred_neon.c b/vpx_dsp/arm/highbd_avg_pred_neon.c new file mode 100644 index 000000000..3063acbb3 --- /dev/null +++ b/vpx_dsp/arm/highbd_avg_pred_neon.c @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, + int width, int height, const uint16_t *ref, + int ref_stride) { + int i = height; + if (width > 8) { + do { + int j = 0; + do { + const uint16x8_t p = vld1q_u16(pred + j); + const uint16x8_t r = vld1q_u16(ref + j); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred + j, avg); + + j += 8; + } while (j < width); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else if (width == 8) { + do { + const uint16x8_t p = vld1q_u16(pred); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vrhaddq_u16(p, r); + vst1q_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } else { + assert(width == 4); + do { + const uint16x4_t p = vld1_u16(pred); + const uint16x4_t r = vld1_u16(ref); + + uint16x4_t avg = vrhadd_u16(p, r); + vst1_u16(comp_pred, avg); + + comp_pred += width; + pred += width; + ref += ref_stride; + } while (--i != 0); + } +} diff --git a/vpx_dsp/arm/highbd_hadamard_neon.c b/vpx_dsp/arm/highbd_hadamard_neon.c new file mode 100644 index 000000000..7be88f6bc --- /dev/null +++ b/vpx_dsp/arm/highbd_hadamard_neon.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, + int16x8_t *a7) { + int16x8_t b0 = vaddq_s16(*a0, *a1); + int16x8_t b1 = vsubq_s16(*a0, *a1); + int16x8_t b2 = vaddq_s16(*a2, *a3); + int16x8_t b3 = vsubq_s16(*a2, *a3); + int16x8_t b4 = vaddq_s16(*a4, *a5); + int16x8_t b5 = vsubq_s16(*a4, *a5); + int16x8_t b6 = vaddq_s16(*a6, *a7); + int16x8_t b7 = vsubq_s16(*a6, *a7); + + int16x8_t c0 = vaddq_s16(b0, b2); + int16x8_t c2 = vsubq_s16(b0, b2); + int16x8_t c1 = vaddq_s16(b1, b3); + int16x8_t c3 = vsubq_s16(b1, b3); + int16x8_t c4 = vaddq_s16(b4, b6); + int16x8_t c6 = vsubq_s16(b4, b6); + int16x8_t c5 = vaddq_s16(b5, b7); + int16x8_t c7 = vsubq_s16(b5, b7); + + *a0 = vaddq_s16(c0, c4); + *a2 = vsubq_s16(c0, c4); + *a7 = vaddq_s16(c1, c5); + *a6 = vsubq_s16(c1, c5); + *a3 = vaddq_s16(c2, c6); + *a1 = vsubq_s16(c2, c6); + *a4 = vaddq_s16(c3, c7); + *a5 = vsubq_s16(c3, c7); +} + +static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1, + int16x4_t a2, int16x4_t a3, + int16x4_t a4, int16x4_t a5, + int16x4_t a6, int16x4_t a7, + tran_low_t *coeff) { + int32x4_t b0 = vaddl_s16(a0, a1); + int32x4_t b1 = vsubl_s16(a0, a1); + int32x4_t b2 = vaddl_s16(a2, a3); + int32x4_t b3 = vsubl_s16(a2, a3); + int32x4_t b4 = vaddl_s16(a4, a5); + int32x4_t b5 = vsubl_s16(a4, a5); + int32x4_t b6 = vaddl_s16(a6, a7); + int32x4_t b7 = vsubl_s16(a6, a7); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c3 = vsubq_s32(b1, b3); + int32x4_t c4 = vaddq_s32(b4, b6); + int32x4_t c6 = vsubq_s32(b4, b6); + int32x4_t c5 = vaddq_s32(b5, b7); + int32x4_t c7 = vsubq_s32(b5, b7); + + int32x4_t d0 = vaddq_s32(c0, c4); + int32x4_t d2 = vsubq_s32(c0, c4); + int32x4_t d7 = vaddq_s32(c1, c5); + int32x4_t d6 = vsubq_s32(c1, c5); + int32x4_t d3 = vaddq_s32(c2, c6); + int32x4_t d1 = vsubq_s32(c2, c6); + int32x4_t d4 = vaddq_s32(c3, c7); + int32x4_t d5 = vsubq_s32(c3, c7); + + store_s32q_to_tran_low(coeff + 0, d0); + store_s32q_to_tran_low(coeff + 4, d1); + store_s32q_to_tran_low(coeff + 8, d2); + store_s32q_to_tran_low(coeff + 12, d3); + store_s32q_to_tran_low(coeff + 16, d4); + store_s32q_to_tran_low(coeff + 20, d5); + store_s32q_to_tran_low(coeff + 24, d6); + store_s32q_to_tran_low(coeff + 28, d7); +} + +void vpx_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, + tran_low_t *coeff) { + int16x4_t b0, b1, b2, b3, b4, b5, b6, b7; + + int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride); + int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride); + int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride); + int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride); + int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride); + int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride); + int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride); + int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride); + + // For the first pass we can stay in 16-bit elements (4095*8 = 32760). + hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + // For the second pass we need to widen to 32-bit elements, so we're + // processing 4 columns at a time. + // Skip the second transpose because it is not required. 
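Restating the bit-width argument from the first-pass comment as explicit arithmetic (a sanity sketch, assuming at most 12-bit input so |src_diff| <= 4095): each of the three butterfly stages at most doubles the magnitude, so the first pass peaks at 4095 * 8 = 32760, which still fits in int16_t, while a full 8x8 transform would reach 4095 * 64, which is why the second pass widens to 32 bits.

#include <assert.h>
#include <stdint.h>

static void hadamard_headroom_check(void) {
  assert(4095 * 8 <= INT16_MAX);  /* first pass is safe in 16 bits */
  assert(4095 * 64 > INT16_MAX);  /* second pass must widen to 32 bits */
}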
+ + b0 = vget_low_s16(s0); + b1 = vget_low_s16(s1); + b2 = vget_low_s16(s2); + b3 = vget_low_s16(s3); + b4 = vget_low_s16(s4); + b5 = vget_low_s16(s5); + b6 = vget_low_s16(s6); + b7 = vget_low_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff); + + b0 = vget_high_s16(s0); + b1 = vget_high_s16(s1); + b2 = vget_high_s16(s2); + b3 = vget_high_s16(s3); + b4 = vget_high_s16(s4); + b5 = vget_high_s16(s5); + b6 = vget_high_s16(s6); + b7 = vget_high_s16(s7); + + hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32); +} + +void vpx_highbd_hadamard_16x16_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 16x16 to 8x32 and remove stride. + // Top left first. + vpx_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64); + // Bottom left. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride, + coeff + 128); + // Bottom right. + vpx_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride, + coeff + 192); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 64); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 128); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 192); + + int32x4_t b0 = vhaddq_s32(a0, a1); + int32x4_t b1 = vhsubq_s32(a0, a1); + int32x4_t b2 = vhaddq_s32(a2, a3); + int32x4_t b3 = vhsubq_s32(a2, a3); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 64, c1); + store_s32q_to_tran_low(coeff + 4 * i + 128, c2); + store_s32q_to_tran_low(coeff + 4 * i + 192, c3); + } while (++i < 16); +} + +void vpx_highbd_hadamard_32x32_neon(const int16_t *src_diff, + ptrdiff_t src_stride, tran_low_t *coeff) { + int i = 0; + + // Rearrange 32x32 to 16x64 and remove stride. + // Top left first. + vpx_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff); + // Top right. + vpx_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256); + // Bottom left. + vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride, + coeff + 512); + // Bottom right. 
+ vpx_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride, + coeff + 768); + + do { + int32x4_t a0 = load_tran_low_to_s32q(coeff + 4 * i); + int32x4_t a1 = load_tran_low_to_s32q(coeff + 4 * i + 256); + int32x4_t a2 = load_tran_low_to_s32q(coeff + 4 * i + 512); + int32x4_t a3 = load_tran_low_to_s32q(coeff + 4 * i + 768); + + int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2); + int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2); + int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2); + int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2); + + int32x4_t c0 = vaddq_s32(b0, b2); + int32x4_t c1 = vaddq_s32(b1, b3); + int32x4_t c2 = vsubq_s32(b0, b2); + int32x4_t c3 = vsubq_s32(b1, b3); + + store_s32q_to_tran_low(coeff + 4 * i, c0); + store_s32q_to_tran_low(coeff + 4 * i + 256, c1); + store_s32q_to_tran_low(coeff + 4 * i + 512, c2); + store_s32q_to_tran_low(coeff + 4 * i + 768, c3); + } while (++i < 64); +} diff --git a/vpx_dsp/arm/highbd_intrapred_neon.c b/vpx_dsp/arm/highbd_intrapred_neon.c index 6f7e5da76..235cb5b99 100644 --- a/vpx_dsp/arm/highbd_intrapred_neon.c +++ b/vpx_dsp/arm/highbd_intrapred_neon.c @@ -12,23 +12,22 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { +static INLINE uint16_t dc_sum_4(const uint16_t *ref) { const uint16x4_t ref_u16 = vld1_u16(ref); - const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); - return vpadd_u16(p0, p0); + return horizontal_add_uint16x4(ref_u16); } static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, const uint16x4_t dc) { - const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_u16(dst, dc_dup); + vst1_u16(dst, dc); } } @@ -37,21 +36,17 @@ void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *left, int bd) { const uint16x4_t a = vld1_u16(above); const uint16x4_t l = vld1_u16(left); - uint16x4_t sum; - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x4(vadd_u16(a, l)); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 3); (void)bd; - sum = vadd_u16(a, l); - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 3); dc_store_4x4(dst, stride, dc); } void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(left); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)above; (void)bd; dc_store_4x4(dst, stride, dc); @@ -60,8 +55,8 @@ void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_4(above); - const uint16x4_t dc = vrshr_n_u16(sum, 2); + const uint16_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(vdup_n_u16(sum), 2); (void)left; (void)bd; dc_store_4x4(dst, stride, dc); @@ -79,19 +74,16 @@ void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { +static INLINE uint16_t dc_sum_8(const uint16_t *ref) { const uint16x8_t ref_u16 = vld1q_u16(ref); - uint16x4_t 
sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); + return horizontal_add_uint16x8(ref_u16); } static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + const uint16x8_t dc) { int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1q_u16(dst, dc_dup); + vst1q_u16(dst, dc); } } @@ -101,20 +93,17 @@ void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16x8_t above_u16 = vld1q_u16(above); const uint16x8_t left_u16 = vld1q_u16(left); const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint16x4_t dc; + const uint16_t sum = horizontal_add_uint16x8(p0); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)bd; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vrshr_n_u16(sum, 4); dc_store_8x8(dst, stride, dc); } void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(left); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)above; (void)bd; dc_store_8x8(dst, stride, dc); @@ -123,8 +112,8 @@ void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_8(above); - const uint16x4_t dc = vrshr_n_u16(sum, 3); + const uint16_t sum = dc_sum_8(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 3); (void)left; (void)bd; dc_store_8x8(dst, stride, dc); @@ -133,7 +122,7 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_8x8(dst, stride, dc); @@ -142,47 +131,43 @@ void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { - const uint16x8x2_t ref_u16 = vld2q_u16(ref); - const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint16_t *ref) { + const uint16x8_t ref_u16_0 = vld1q_u16(ref + 0); + const uint16x8_t ref_u16_1 = vld1q_u16(ref + 8); + const uint16x8_t p0 = vaddq_u16(ref_u16_0, ref_u16_1); + return horizontal_add_uint16x8(p0); } static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8x2_t dc_dup; + const uint16x8_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); for (i = 0; i < 16; ++i, dst += stride) { - vst2q_u16(dst, dc_dup); + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); } } void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t a = vld2q_u16(above); - const uint16x8x2_t l = vld2q_u16(left); - const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); - 
const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t pa = vaddq_u16(a0, a1); + const uint16x8_t pl = vaddq_u16(l0, l1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum; - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)bd; - pal1 = vpadd_u16(pal1, pal1); - sum = vpaddl_u16(pal1); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); dc_store_16x16(dst, stride, dc); } void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(left); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(left); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)above; (void)bd; dc_store_16x16(dst, stride, dc); @@ -191,8 +176,8 @@ void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t sum = dc_sum_16(above); - const uint16x4_t dc = vrshr_n_u16(sum, 4); + const uint16_t sum = dc_sum_16(above); + const uint16x8_t dc = vrshrq_n_u16(vdupq_n_u16(sum), 4); (void)left; (void)bd; dc_store_16x16(dst, stride, dc); @@ -201,7 +186,7 @@ void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -210,56 +195,58 @@ void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { - const uint16x8x4_t r = vld4q_u16(ref); - const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); - const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); +static INLINE uint32_t dc_sum_32(const uint16_t *ref) { + const uint16x8_t r0 = vld1q_u16(ref + 0); + const uint16x8_t r1 = vld1q_u16(ref + 8); + const uint16x8_t r2 = vld1q_u16(ref + 16); + const uint16x8_t r3 = vld1q_u16(ref + 24); + const uint16x8_t p0 = vaddq_u16(r0, r1); + const uint16x8_t p1 = vaddq_u16(r2, r3); const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpaddl_u16(sum); + return horizontal_add_uint16x8(p2); } static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, - const uint16x4_t dc) { - uint16x8x2_t dc_dup; + const uint16x8_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); - for (i = 0; i < 32; ++i) { - vst2q_u16(dst, dc_dup); - dst += 16; - vst2q_u16(dst, dc_dup); - dst += stride - 16; + vst1q_u16(dst + 0, dc); + vst1q_u16(dst + 8, dc); + vst1q_u16(dst + 16, dc); + vst1q_u16(dst + 24, dc); + dst += stride; } } void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x4_t a = vld4q_u16(above); - const 
uint16x8x4_t l = vld4q_u16(left); - const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); - const uint16x8_t pa1 = vaddq_u16(a.val[2], a.val[3]); - const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); - const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t a0 = vld1q_u16(above + 0); + const uint16x8_t a1 = vld1q_u16(above + 8); + const uint16x8_t a2 = vld1q_u16(above + 16); + const uint16x8_t a3 = vld1q_u16(above + 24); + const uint16x8_t l0 = vld1q_u16(left + 0); + const uint16x8_t l1 = vld1q_u16(left + 8); + const uint16x8_t l2 = vld1q_u16(left + 16); + const uint16x8_t l3 = vld1q_u16(left + 24); + const uint16x8_t pa0 = vaddq_u16(a0, a1); + const uint16x8_t pa1 = vaddq_u16(a2, a3); + const uint16x8_t pl0 = vaddq_u16(l0, l1); + const uint16x8_t pl1 = vaddq_u16(l2, l3); const uint16x8_t pa = vaddq_u16(pa0, pa1); const uint16x8_t pl = vaddq_u16(pl0, pl1); const uint16x8_t pal0 = vaddq_u16(pa, pl); - const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); - uint32x2_t sum = vpaddl_u16(pal1); - uint16x4_t dc; + const uint32_t sum = horizontal_add_uint16x8(pal0); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 6), 0); (void)bd; - sum = vpadd_u32(sum, sum); - dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); dc_store_32x32(dst, stride, dc); } void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(left); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(left); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)above; (void)bd; dc_store_32x32(dst, stride, dc); @@ -268,8 +255,8 @@ void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint32x2_t sum = dc_sum_32(above); - const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + const uint32_t sum = dc_sum_32(above); + const uint16x8_t dc = vdupq_lane_u16(vrshrn_n_u32(vdupq_n_u32(sum), 5), 0); (void)left; (void)bd; dc_store_32x32(dst, stride, dc); @@ -278,7 +265,7 @@ void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + const uint16x8_t dc = vdupq_n_u16(1 << (bd - 1)); (void)above; (void)left; dc_store_32x32(dst, stride, dc); @@ -289,166 +276,1304 @@ void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t ABCDEFGH = vld1q_u16(above); - const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); - const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); - const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); - const uint16x4_t avg2_low = vget_low_u16(avg2); - const uint16x4_t avg2_high = vget_high_u16(avg2); - const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); - const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); - const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + uint16x8_t a0, a1, a2, d0; + uint16_t a7; (void)left; (void)bd; - vst1_u16(dst, avg2_low); - dst += stride; - vst1_u16(dst, r1); - dst += 
stride; - vst1_u16(dst, r2); - dst += stride; - vst1_u16(dst, r3); - vst1q_lane_u16(dst + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row) { - *row = vextq_u16(*row, above_right, 1); - vst1q_u16(*dst, *row); - *dst += stride; + a0 = vld1q_u16(above); + a7 = above[7]; + + // [ above[1], ..., above[7], x ] + a1 = vextq_u16(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vextq_u16(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=3 [ d0[3], d0[4], d0[5], above[7] ] + vst1_u16(dst + 0 * stride, vget_low_u16(d0)); + vst1_u16(dst + 1 * stride, vget_low_u16(vextq_u16(d0, d0, 1))); + vst1_u16(dst + 2 * stride, vget_low_u16(vextq_u16(d0, d0, 2))); + vst1_u16(dst + 3 * stride, vget_low_u16(vextq_u16(d0, d0, 3))); + + // We stored d0[6] above, so fixup into above[7]. + dst[3 * stride + 3] = a7; } void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0 = vld1q_u16(above); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); - const uint16x8_t A1 = vld1q_u16(above + 1); - const uint16x8_t A2 = vld1q_u16(above + 2); - const uint16x8_t avg1 = vhaddq_u16(A0, A2); - uint16x8_t row = vrhaddq_u16(avg1, A1); + uint16x8_t ax0, a0, a1, a7, d0; (void)left; (void)bd; - vst1q_u16(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1q_u16(dst, above_right); -} - -static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, - const uint16x8_t above_right, uint16x8_t *row_0, - uint16x8_t *row_1) { - *row_0 = vextq_u16(*row_0, *row_1, 1); - *row_1 = vextq_u16(*row_1, above_right, 1); - vst1q_u16(*dst, *row_0); - *dst += 8; - vst1q_u16(*dst, *row_1); - *dst += stride - 8; + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_dup_u16(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + + // Undo the earlier ext, then incrementally shift in duplicates of above[7].
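The AVG3 filter used throughout these d45/d63/d117 hunks is computed as vrhaddq_u16(vhaddq_u16(a0, a2), a1), and this split is an exact match for the scalar rounding, not an approximation. A small sketch verifying the identity (the helper names are illustrative):

#include <assert.h>
#include <stdint.h>

/* Scalar reference: AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2. */
static uint16_t avg3_ref(uint16_t a, uint16_t b, uint16_t c) {
  return (uint16_t)((a + 2 * b + c + 2) >> 2);
}

/* Model of vrhadd(vhadd(a, c), b): (((a + c) >> 1) + b + 1) >> 1.
 * Writing a + c = 2q + r with r in {0, 1} shows both expressions reduce
 * to (q + b + 1) >> 1, so they agree for all inputs. */
static uint16_t avg3_neon_model(uint16_t a, uint16_t b, uint16_t c) {
  return (uint16_t)((((a + c) >> 1) + b + 1) >> 1);
}

static void avg3_spot_check(void) {
  assert(avg3_ref(1, 2, 3) == avg3_neon_model(1, 2, 3));
  assert(avg3_ref(4095, 0, 1) == avg3_neon_model(4095, 0, 1));
}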
+ vst1q_u16(dst + 0 * stride, vextq_u16(d0, a7, 1)); + vst1q_u16(dst + 1 * stride, vextq_u16(d0, a7, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(d0, a7, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d0, a7, 4)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0, a7, 5)); + vst1q_u16(dst + 5 * stride, vextq_u16(d0, a7, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0, a7, 7)); + vst1q_u16(dst + 7 * stride, a7); } void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, d0[2]; (void)left; (void)bd; - vst1q_u16(dst, row_0); - vst1q_u16(dst + 8, row_1); - dst += stride; - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - d45_store_16(&dst, stride, above_right, &row_0, &row_1); - vst1q_u16(dst, above_right); - vst1q_u16(dst + 8, above_right); + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_dup_u16(above + 15); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + // We have one unused lane here to leave room to shift in above[15] in the + // last lane: + // d0[0][0] = x (don't care) + // d0[0][1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[0][7] = AVG3(above[6], above[7], above[8]); + // d0[1][0] = AVG3(above[7], above[8], above[9]); + // ... + // d0[1][7] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + + // Incrementally shift in duplicates of above[15].
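The long run of stores that follows is a sliding window: row r of the d45 output is the AVG3 sequence advanced by r lanes, padded with the duplicated top-right sample once the sequence runs out. A scalar model of the row generation (a hypothetical helper, not part of the patch):

#include <stdint.h>

/* Row r, column c of an n-wide d45 block takes entry r + c of the AVG3
 * sequence, or the top-right sample once that index runs off the end. */
static void d45_row_model(uint16_t *row, const uint16_t *avg3_seq,
                          int seq_len, uint16_t top_right, int r, int n) {
  int c;
  for (c = 0; c < n; ++c) {
    const int i = r + c;
    row[c] = (i < seq_len) ? avg3_seq[i] : top_right;
  }
}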
+ vst1q_u16(dst + 0 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 7 * stride + 0, d0[1]); + vst1q_u16(dst + 7 * stride + 8, a15); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[1], a15, 1)); + vst1q_u16(dst + 8 * stride + 8, a15); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d0[1], a15, 2)); + vst1q_u16(dst + 9 * stride + 8, a15); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[1], a15, 3)); + vst1q_u16(dst + 10 * stride + 8, a15); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d0[1], a15, 4)); + vst1q_u16(dst + 11 * stride + 8, a15); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[1], a15, 5)); + vst1q_u16(dst + 12 * stride + 8, a15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d0[1], a15, 6)); + vst1q_u16(dst + 13 * stride + 8, a15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[1], a15, 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, a15); + vst1q_u16(dst + 15 * stride + 8, a15); } void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8_t A0_0 = vld1q_u16(above); - const uint16x8_t A0_1 = vld1q_u16(above + 8); - const uint16x8_t A0_2 = vld1q_u16(above + 16); - const uint16x8_t A0_3 = vld1q_u16(above + 24); - const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); - const uint16x8_t A1_0 = vld1q_u16(above + 1); - const uint16x8_t A1_1 = vld1q_u16(above + 9); - const uint16x8_t A1_2 = vld1q_u16(above + 17); - const uint16x8_t A1_3 = vld1q_u16(above + 25); - const uint16x8_t A2_0 = vld1q_u16(above + 2); - const uint16x8_t A2_1 = vld1q_u16(above + 10); - const uint16x8_t A2_2 = vld1q_u16(above + 18); - const uint16x8_t A2_3 = vld1q_u16(above + 26); - const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); - const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); - const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); - const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); - uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); - uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); - uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); - uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + uint16x8_t ax0, a0, a1, a7, a8, a9, a15, a16, a17, a23, a24, a25, a31, d0[4]; int i; (void)left; (void)bd; - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; - - for (i = 0; i < 30; ++i) { - row_0 = vextq_u16(row_0, row_1, 1); - row_1 = vextq_u16(row_1, row_2, 1); - row_2 = vextq_u16(row_2, row_3, 1); - row_3 = vextq_u16(row_3, above_right, 1); - vst1q_u16(dst, row_0); - dst += 8; - vst1q_u16(dst, row_1); - dst += 8; - vst1q_u16(dst, row_2); - dst += 8; - vst1q_u16(dst, row_3); - dst += stride - 24; + a0 = 
vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a31 = vld1q_dup_u16(above + 31); + + // [ x, above[0], ... , above[6] ] + ax0 = vextq_u16(a0, a0, 7); + + d0[0] = vrhaddq_u16(vhaddq_u16(ax0, a1), a0); + d0[1] = vrhaddq_u16(vhaddq_u16(a7, a9), a8); + d0[2] = vrhaddq_u16(vhaddq_u16(a15, a17), a16); + d0[3] = vrhaddq_u16(vhaddq_u16(a23, a25), a24); + + for (i = 0; i < 32; ++i) { + d0[0] = vextq_u16(d0[0], d0[1], 1); + d0[1] = vextq_u16(d0[1], d0[2], 1); + d0[2] = vextq_u16(d0[2], d0[3], 1); + d0[3] = vextq_u16(d0[3], a31, 1); + vst1q_u16(dst + 0, d0[0]); + vst1q_u16(dst + 8, d0[1]); + vst1q_u16(dst + 16, d0[2]); + vst1q_u16(dst + 24, d0[3]); + dst += stride; + } +} - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); - dst += 8; - vst1q_u16(dst, above_right); +// ----------------------------------------------------------------------------- + +void vpx_highbd_d63_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + (void)bd; + + a0 = vld1_u16(above + 0); + a1 = vld1_u16(above + 1); + a2 = vld1_u16(above + 2); + a3 = vld1_u16(above + 3); + + d0 = vrhadd_u16(a0, a1); + d1 = vrhadd_u16(vhadd_u16(a0, a2), a1); + d2 = vrhadd_u16(a1, a2); + d3 = vrhadd_u16(vhadd_u16(a1, a3), a2); + + // Note that here we are performing a full avg calculation for the final + // elements rather than storing a duplicate of above[3], which differs + // (correctly) from the general scheme employed by the bs={8,16,32} + // implementations in order to match the original C implementation. + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, d2); + vst1_u16(dst + 3 * stride, d3); +} + +void vpx_highbd_d63_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t a0, a1, a2, a7, d0, d1, d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a7 = vld1q_dup_u16(above + 7); + + d0 = vrhaddq_u16(a0, a1); + d1 = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + + // We want to store: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7] ] + // stride=3 [ d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7] ] + // stride=4 [ d0[2], d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7] ] + // stride=5 [ d1[2], d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7] ] + // stride=6 [ d0[3], d0[4], d0[5], d0[6], a[7], a[7], a[7], a[7] ] + // stride=7 [ d1[3], d1[4], d1[5], d1[6], a[7], a[7], a[7], a[7] ] + // Note in particular that d0[7] and d1[7] are only ever referenced in the + // stride=0 and stride=1 cases respectively, and in later strides are + // replaced by a copy of above[7]. These are equivalent if for i>7, + // above[i]==above[7]; however, that is not always the case. + + // Strip out d0[7] and d1[7] so that we can replace them with an additional + // copy of above[7]; the first vector here doesn't matter, so just reuse + // d0/d1.
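To make the caveat above concrete, consider hypothetical samples above[7] = 100 and above[8] = 60: AVG2 of the pair is 80, while the substituted value is above[7] = 100, so the two schemes genuinely differ whenever the border is not flat beyond the block edge.

#include <assert.h>
#include <stdint.h>

/* Hypothetical values illustrating why d0[7] cannot be kept in the
 * shifted rows: AVG2(100, 60) = 80 != above[7] = 100. */
static void d63_edge_case_example(void) {
  const uint16_t above7 = 100, above8 = 60;
  const uint16_t avg2 = (uint16_t)((above7 + above8 + 1) >> 1);
  assert(avg2 == 80 && avg2 != above7);
}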
+ d0_ext = vextq_u16(d0, d0, 7); + d1_ext = vextq_u16(d1, d1, 7); + + // Shuffle in duplicates of above[7] and store. + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(d0_ext, a7, 2)); + vst1q_u16(dst + 3 * stride, vextq_u16(d1_ext, a7, 2)); + vst1q_u16(dst + 4 * stride, vextq_u16(d0_ext, a7, 3)); + vst1q_u16(dst + 5 * stride, vextq_u16(d1_ext, a7, 3)); + vst1q_u16(dst + 6 * stride, vextq_u16(d0_ext, a7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(d1_ext, a7, 4)); +} + +void vpx_highbd_d63_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a15, d0[2], d1[2], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a15 = vld1q_dup_u16(above + 15); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[15]; the first vector here doesn't matter, so + // just reuse the same vector. + d0_ext = vextq_u16(d0[1], d0[1], 7); + d1_ext = vextq_u16(d1[1], d1[1], 7); + + // Shuffle in duplicates of above[15] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_ext, a15, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_ext, a15, 2)); + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_ext, a15, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_ext, a15, 3)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_ext, a15, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_ext, a15, 4)); + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_ext, a15, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_ext, a15, 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_ext, a15, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_ext, a15, 6)); + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_ext, a15, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_ext, a15, 7)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, a15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, a15); +} +
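Before the 32x32 version below, it may help to see the whole d63 pattern in scalar form: even output rows walk an AVG2 sequence, odd rows an AVG3 sequence, and both advance one step every two rows. This is a simplified scalar model under the assumptions discussed above (the helper is hypothetical, and the two sequences plus the padding sample are assumed precomputed):

#include <stdint.h>

/* Simplified d63 model: row r takes sequence entries starting at r / 2,
 * padding with the last above sample once the sequence is exhausted. */
static void d63_rows_model(uint16_t *dst, int stride,
                           const uint16_t *avg2_seq, const uint16_t *avg3_seq,
                           int seq_len, uint16_t pad, int bs) {
  int r, c;
  for (r = 0; r < bs; ++r) {
    const uint16_t *seq = (r & 1) ? avg3_seq : avg2_seq;
    for (c = 0; c < bs; ++c) {
      const int i = (r >> 1) + c;
      dst[r * stride + c] = (i < seq_len) ? seq[i] : pad;
    }
  }
}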
+void vpx_highbd_d63_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d63_predictor_8x8_neon for details on the implementation. + uint16x8_t a0, a1, a2, a8, a9, a10, a16, a17, a18, a24, a25, a26, a31, d0[4], + d1[4], d0_ext, d1_ext; + (void)left; + (void)bd; + + a0 = vld1q_u16(above + 0); + a1 = vld1q_u16(above + 1); + a2 = vld1q_u16(above + 2); + a8 = vld1q_u16(above + 8); + a9 = vld1q_u16(above + 9); + a10 = vld1q_u16(above + 10); + a16 = vld1q_u16(above + 16); + a17 = vld1q_u16(above + 17); + a18 = vld1q_u16(above + 18); + a24 = vld1q_u16(above + 24); + a25 = vld1q_u16(above + 25); + a26 = vld1q_u16(above + 26); + a31 = vld1q_dup_u16(above + 31); + + d0[0] = vrhaddq_u16(a0, a1); + d0[1] = vrhaddq_u16(a8, a9); + d0[2] = vrhaddq_u16(a16, a17); + d0[3] = vrhaddq_u16(a24, a25); + d1[0] = vrhaddq_u16(vhaddq_u16(a0, a2), a1); + d1[1] = vrhaddq_u16(vhaddq_u16(a8, a10), a9); + d1[2] = vrhaddq_u16(vhaddq_u16(a16, a18), a17); + d1[3] = vrhaddq_u16(vhaddq_u16(a24, a26), a25); + + // Strip out the final element of d0/d1 so that we can replace it with an + // additional copy of above[31]; the first vector here doesn't matter, so + // just reuse the same vector. + d0_ext = vextq_u16(d0[3], d0[3], 7); + d1_ext = vextq_u16(d1[3], d1[3], 7); + + // Shuffle in duplicates of above[31] and store. Note that cases involving + // {d0,d1}_ext require an extra shift to undo the shifting out of the final + // element from above. + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1_ext, a31, 2)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1_ext, a31, 3)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1_ext, a31, 4)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 16, 
vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1_ext, a31, 5)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1_ext, a31, 6)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1_ext, a31, 7)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 14 * stride + 24, a31); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 15 * stride + 24, a31); + + vst1q_u16(dst + 16 * stride + 0, d0[1]); + vst1q_u16(dst + 16 * stride + 8, d0[2]); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d0_ext, a31, 1)); + vst1q_u16(dst + 16 * stride + 24, a31); + vst1q_u16(dst + 17 * stride + 0, d1[1]); + vst1q_u16(dst + 17 * stride + 8, d1[2]); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d1_ext, a31, 1)); + vst1q_u16(dst + 17 * stride + 24, a31); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0_ext, a31, 2)); + vst1q_u16(dst + 18 * stride + 24, a31); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d1[2], d1[3], 1)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1_ext, a31, 2)); + vst1q_u16(dst + 19 * stride + 24, a31); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0_ext, a31, 3)); + vst1q_u16(dst + 20 * stride + 24, a31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d1[2], d1[3], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1_ext, a31, 3)); + vst1q_u16(dst + 21 * stride + 24, a31); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0_ext, a31, 4)); + vst1q_u16(dst + 22 * stride + 24, a31); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst 
+ 23 * stride + 16, vextq_u16(d1_ext, a31, 4)); + vst1q_u16(dst + 23 * stride + 24, a31); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0_ext, a31, 5)); + vst1q_u16(dst + 24 * stride + 24, a31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d1[2], d1[3], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1_ext, a31, 5)); + vst1q_u16(dst + 25 * stride + 24, a31); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0_ext, a31, 6)); + vst1q_u16(dst + 26 * stride + 24, a31); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1_ext, a31, 6)); + vst1q_u16(dst + 27 * stride + 24, a31); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0_ext, a31, 7)); + vst1q_u16(dst + 28 * stride + 24, a31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d1[2], d1[3], 6)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1_ext, a31, 7)); + vst1q_u16(dst + 29 * stride + 24, a31); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 30 * stride + 16, a31); + vst1q_u16(dst + 30 * stride + 24, a31); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 31 * stride + 16, a31); + vst1q_u16(dst + 31 * stride + 24, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d117_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left + 0); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(az, a0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + + col0 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + col0_even = vdup_lane_u16(col0, 0); + col0_odd = vdup_lane_u16(col0, 1); + + vst1_u16(dst + 0 * stride, d0); + vst1_u16(dst + 1 * stride, d1); + vst1_u16(dst + 2 * stride, vext_u16(col0_even, d0, 3)); + vst1_u16(dst + 3 * stride, vext_u16(col0_odd, d1, 3)); +} + +void vpx_highbd_d117_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, col0, col0_even, col0_odd, d0, d1; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhaddq_u16(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... + // col0[0] = AVG3(left[6], left[7], left[8]) + col0 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0 = vrev64q_u16(vextq_u16(col0, col0, 4)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements; we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzpq_u16(col0, col0).val[1]; + col0_odd = vuzpq_u16(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1q_u16(dst + 0 * stride, d0); + vst1q_u16(dst + 1 * stride, d1); + vst1q_u16(dst + 2 * stride, vextq_u16(col0_even, d0, 7)); + vst1q_u16(dst + 3 * stride, vextq_u16(col0_odd, d1, 7)); + vst1q_u16(dst + 4 * stride, vextq_u16(col0_even, d0, 6)); + vst1q_u16(dst + 5 * stride, vextq_u16(col0_odd, d1, 6)); + vst1q_u16(dst + 6 * stride, vextq_u16(col0_even, d0, 5)); + vst1q_u16(dst + 7 * stride, vextq_u16(col0_odd, d1, 5)); +} + +void vpx_highbd_d117_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, col0_lo, + col0_hi, col0_even, col0_odd, d0_lo, d0_hi, d1_lo, d1_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ]
, left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0_lo = vrhaddq_u16(az, a0); + d0_hi = vrhaddq_u16(a7, a8); + d1_lo = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1_hi = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + + col0_lo = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0_hi = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + // Reverse within each vector, then swap the array indices in the uzp to + // complete the reversal across all 16 elements. + col0_lo = vrev64q_u16(vextq_u16(col0_lo, col0_lo, 4)); + col0_hi = vrev64q_u16(vextq_u16(col0_hi, col0_hi, 4)); + col0_even = vuzpq_u16(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u16(col0_hi, col0_lo).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0_lo); + vst1q_u16(dst + 0 * stride + 8, d0_hi); + vst1q_u16(dst + 1 * stride + 0, d1_lo); + vst1q_u16(dst + 1 * stride + 8, d1_hi); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even, d0_lo, 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0_lo, d0_hi, 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd, d1_lo, 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1_lo, d1_hi, 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even, d0_lo, 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0_lo, d0_hi, 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd, d1_lo, 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1_lo, d1_hi, 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even, d0_lo, 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0_lo, d0_hi, 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd, d1_lo, 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1_lo, d1_hi, 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even, d0_lo, 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0_lo, d0_hi, 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd, d1_lo, 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1_lo, d1_hi, 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even, d0_lo, 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0_lo, d0_hi, 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd, d1_lo, 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1_lo, d1_hi, 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even, d0_lo, 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0_lo, d0_hi, 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd, d1_lo, 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1_lo, d1_hi, 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even, d0_lo, 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0_lo, d0_hi, 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd, d1_lo, 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1_lo, d1_hi, 1)); +} + +void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4], + col0_even[2], col0_odd[2]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ..., left[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = 
+void vpx_highbd_d117_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], col0[4], + col0_even[2], col0_odd[2]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ..., above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ..., left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(az, a0); + d0[1] = vrhaddq_u16(a7, a8); + d0[2] = vrhaddq_u16(a15, a16); + d0[3] = vrhaddq_u16(a23, a24); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + col0[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + col0[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + col0[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + col0[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + // Reverse within each vector, then swap the array indices in both the uzp + // and the col0_{even,odd} assignment to complete the reversal across all + // 32 elements. + col0[0] = vrev64q_u16(vextq_u16(col0[0], col0[0], 4)); + col0[1] = vrev64q_u16(vextq_u16(col0[1], col0[1], 4)); + col0[2] = vrev64q_u16(vextq_u16(col0[2], col0[2], 4)); + col0[3] = vrev64q_u16(vextq_u16(col0[3], col0[3], 4)); + + col0_even[1] = vuzpq_u16(col0[1], col0[0]).val[1]; + col0_even[0] = vuzpq_u16(col0[3], col0[2]).val[1]; + col0_odd[1] = vuzpq_u16(col0[1], col0[0]).val[0]; + col0_odd[0] = vuzpq_u16(col0[3], col0[2]).val[0]; + + vst1q_u16(dst + 0 * stride + 0, d0[0]); + vst1q_u16(dst + 0 * stride + 8, d0[1]); + vst1q_u16(dst + 0 * stride + 16, d0[2]); + vst1q_u16(dst + 0 * stride + 24, d0[3]); + vst1q_u16(dst + 1 * stride + 0, d1[0]); + vst1q_u16(dst + 1 * stride + 8, d1[1]); + vst1q_u16(dst + 1 * stride + 16, d1[2]); + vst1q_u16(dst + 1 * stride + 24, d1[3]); + + vst1q_u16(dst + 2 * stride + 0, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d0[2], d0[3], 7)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d0[2], d0[3], 6)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[1], d1[2], 6)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[2], d1[3], 6)); + + vst1q_u16(dst + 6 * stride + 0, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d0[2], d0[3], 5)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d1[0], d1[1], 5)); +
vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d0[2], d0[3], 4)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d1[1], d1[2], 4)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[2], d1[3], 4)); + + vst1q_u16(dst + 10 * stride + 0, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d0[2], d0[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d0[2], d0[3], 2)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d1[1], d1[2], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d1[2], d1[3], 2)); + + vst1q_u16(dst + 14 * stride + 0, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d0[2], d0[3], 1)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 16 * stride + 0, col0_even[1]); + vst1q_u16(dst + 16 * stride + 8, d0[0]); + vst1q_u16(dst + 16 * stride + 16, d0[1]); + vst1q_u16(dst + 16 * stride + 24, d0[2]); + vst1q_u16(dst + 17 * stride + 0, col0_odd[1]); + vst1q_u16(dst + 17 * stride + 8, d1[0]); + vst1q_u16(dst + 17 * stride + 16, d1[1]); + vst1q_u16(dst + 17 * stride + 24, d1[2]); + + vst1q_u16(dst + 18 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 7)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(col0_even[1], d0[0], 7)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d0[0], d0[1], 7)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d0[1], d0[2], 7)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 7)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(col0_odd[1], d1[0], 7)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 6)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(col0_even[1], d0[0], 6)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d0[0], d0[1], 6)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d0[1], d0[2], 6)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 6)); + vst1q_u16(dst + 21 * stride + 8, 
vextq_u16(col0_odd[1], d1[0], 6)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d1[0], d1[1], 6)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d1[1], d1[2], 6)); + + vst1q_u16(dst + 22 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 5)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(col0_even[1], d0[0], 5)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d0[0], d0[1], 5)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d0[1], d0[2], 5)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 5)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(col0_odd[1], d1[0], 5)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 4)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(col0_even[1], d0[0], 4)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d0[0], d0[1], 4)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d0[1], d0[2], 4)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 4)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(col0_odd[1], d1[0], 4)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d1[0], d1[1], 4)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d1[1], d1[2], 4)); + + vst1q_u16(dst + 26 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(col0_even[1], d0[0], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d0[0], d0[1], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d0[1], d0[2], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 3)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(col0_odd[1], d1[0], 3)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 2)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(col0_even[1], d0[0], 2)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d0[0], d0[1], 2)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d0[1], d0[2], 2)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(col0_odd[1], d1[0], 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d1[0], d1[1], 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d1[1], d1[2], 2)); + + vst1q_u16(dst + 30 * stride + 0, vextq_u16(col0_even[0], col0_even[1], 1)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(col0_even[1], d0[0], 1)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d0[0], d0[1], 1)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d0[1], d0[2], 1)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(col0_odd[0], col0_odd[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(col0_odd[1], d1[0], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d1[1], d1[2], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d153_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. 
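+ // In the lower-left part of the block the even columns hold the AVG2 values + // (d0) of the left edge and the odd columns the AVG3 values (d2), while the + // remainder of each row comes from the AVG3 row d1; this is why d2 and d0 + // are reversed and zipped into a single d20 pair before being shifted in.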
+ uint16x4_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d20_lo, d20_hi; + (void)bd; + + az = vld1_u16(above - 1); + a0 = vld1_u16(above + 0); + // [ left[0], above[-1], above[0], above[1] ] + l0az = vext_u16(vld1_dup_u16(left), az, 3); + + l0 = vld1_u16(left); + // The last lane here is unused, reading left[4] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], left[2], left[3], x ] + l1 = vext_u16(l0, l0, 1); + // [ above[-1], left[0], left[1], left[2] ] + azl0 = vext_u16(vld1_dup_u16(above - 1), l0, 3); + + d0 = vrhadd_u16(azl0, l0); + d1 = vrhadd_u16(vhadd_u16(l0az, a0), az); + d2 = vrhadd_u16(vhadd_u16(azl0, l1), l0); + + d20_lo = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[0]; + d20_hi = vzip_u16(vrev64_u16(d2), vrev64_u16(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1] ] + vst1_u16(dst + 0 * stride, vext_u16(d20_hi, d1, 3)); + vst1_u16(dst + 1 * stride, vext_u16(d20_hi, d1, 1)); + vst1_u16(dst + 2 * stride, vext_u16(d20_lo, d20_hi, 3)); + vst1_u16(dst + 3 * stride, vext_u16(d20_lo, d20_hi, 1)); +} + +void vpx_highbd_d153_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d0_rev, d2_rev, d20_lo, + d20_hi; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vextq_u16(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhaddq_u16(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... 
+ // d2[7] = AVG3(left[6], left[7], left[8]) + d2 = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end: + d0_rev = vrev64q_u16(vextq_u16(d0, d0, 4)); + d2_rev = vrev64q_u16(vextq_u16(d2, d2, 4)); + + d20_lo = vzipq_u16(d2_rev, d0_rev).val[0]; + d20_hi = vzipq_u16(d2_rev, d0_rev).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1q_u16(dst + 0 * stride, vextq_u16(d20_hi, d1, 7)); + vst1q_u16(dst + 1 * stride, vextq_u16(d20_hi, d1, 5)); + vst1q_u16(dst + 2 * stride, vextq_u16(d20_hi, d1, 3)); + vst1q_u16(dst + 3 * stride, vextq_u16(d20_hi, d1, 1)); + vst1q_u16(dst + 4 * stride, vextq_u16(d20_lo, d20_hi, 7)); + vst1q_u16(dst + 5 * stride, vextq_u16(d20_lo, d20_hi, 5)); + vst1q_u16(dst + 6 * stride, vextq_u16(d20_lo, d20_hi, 3)); + vst1q_u16(dst + 7 * stride, vextq_u16(d20_lo, d20_hi, 1)); +}
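+ +// The store tables above make the d153 geometry explicit: each row is the row +// above shifted right by two pixels, which is why the vext offset drops by +// two per row (7, 5, 3, 1) before the d20 source pair itself advances. +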
+void vpx_highbd_d153_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, l0az, l0, l1, l7, l8, l9, azl0, d0[2], d1[2], + d2[2], d20[4]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[8] to avoid needing to + // materialize a zero: + // [ left[9], ... , left[15], x ] + l9 = vextq_u16(l8, l8, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + + d20[0] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[1] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[2] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[3] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[3], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[3], d1[0], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[3], d1[0], 5)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[3], d1[0], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[3], d1[0], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[1], d20[2], 1)); +}
+ +void vpx_highbd_d153_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + // See vpx_highbd_d153_predictor_8x8_neon for details on the implementation. + uint16x8_t az, a0, a6, a7, a8, a14, a15, a16, a22, a23, a24, l0az, l0, l1, l7, + l8, l9, l15, l16, l17, l23, l24, l25, azl0, d0[4], d1[4], d2[4], d20[8]; + (void)bd; + + az = vld1q_u16(above - 1); + a0 = vld1q_u16(above + 0); + a6 = vld1q_u16(above + 6); + a7 = vld1q_u16(above + 7); + a8 = vld1q_u16(above + 8); + a14 = vld1q_u16(above + 14); + a15 = vld1q_u16(above + 15); + a16 = vld1q_u16(above + 16); + a22 = vld1q_u16(above + 22); + a23 = vld1q_u16(above + 23); + a24 = vld1q_u16(above + 24); + // [ left[0], above[-1], ... , above[5] ] + l0az = vextq_u16(vld1q_dup_u16(left), az, 7); + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l7 = vld1q_u16(left + 7); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l15 = vld1q_u16(left + 15); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l23 = vld1q_u16(left + 23); + l24 = vld1q_u16(left + 24); + // The last lane here is unused, reading left[32] could cause a buffer + // over-read, so just fill with a duplicate of left[24] to avoid needing to + // materialize a zero: + // [ left[25], ... , left[31], x ] + l25 = vextq_u16(l24, l24, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vextq_u16(vld1q_dup_u16(above - 1), l0, 7); + + d0[0] = vrhaddq_u16(azl0, l0); + d0[1] = vrhaddq_u16(l7, l8); + d0[2] = vrhaddq_u16(l15, l16); + d0[3] = vrhaddq_u16(l23, l24); + + d1[0] = vrhaddq_u16(vhaddq_u16(l0az, a0), az); + d1[1] = vrhaddq_u16(vhaddq_u16(a6, a8), a7); + d1[2] = vrhaddq_u16(vhaddq_u16(a14, a16), a15); + d1[3] = vrhaddq_u16(vhaddq_u16(a22, a24), a23); + + d2[0] = vrhaddq_u16(vhaddq_u16(azl0, l1), l0); + d2[1] = vrhaddq_u16(vhaddq_u16(l7, l9), l8); + d2[2] = vrhaddq_u16(vhaddq_u16(l15, l17), l16); + d2[3] = vrhaddq_u16(vhaddq_u16(l23, l25), l24); + + d0[0] = vrev64q_u16(vextq_u16(d0[0], d0[0], 4)); + d0[1] = vrev64q_u16(vextq_u16(d0[1], d0[1], 4)); + d0[2] = vrev64q_u16(vextq_u16(d0[2], d0[2], 4)); + d0[3] = vrev64q_u16(vextq_u16(d0[3], d0[3], 4)); + d2[0] = vrev64q_u16(vextq_u16(d2[0], d2[0], 4)); + d2[1] = vrev64q_u16(vextq_u16(d2[1], d2[1], 4)); + d2[2] = vrev64q_u16(vextq_u16(d2[2], d2[2], 4)); + d2[3] = vrev64q_u16(vextq_u16(d2[3], d2[3], 4)); + + d20[0] = vzipq_u16(d2[3], d0[3]).val[0]; + d20[1] = vzipq_u16(d2[3], d0[3]).val[1]; + d20[2] = vzipq_u16(d2[2], d0[2]).val[0]; + d20[3] = vzipq_u16(d2[2], d0[2]).val[1]; + d20[4] = vzipq_u16(d2[1], d0[1]).val[0]; + d20[5] = vzipq_u16(d2[1], d0[1]).val[1]; + d20[6] = vzipq_u16(d2[0], d0[0]).val[0]; + d20[7] = vzipq_u16(d2[0], d0[0]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 0 * stride + 8, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 0 * stride + 16, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 0 * stride + 24, vextq_u16(d1[2], d1[3], 7)); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(d1[2], d1[3], 5)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(d1[2], d1[3], 3)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(d1[1], d1[2], 1)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(d1[2], d1[3], 1)); + + vst1q_u16(dst + 4 * stride + 0, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 4 * stride + 8, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 4 * stride + 16, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 4 * stride + 24, vextq_u16(d1[1], d1[2], 7)); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(d1[1], d1[2], 5)); + vst1q_u16(dst + 6 * stride + 0, 
vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(d1[1], d1[2], 3)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(d1[0], d1[1], 1)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(d1[1], d1[2], 1)); + + vst1q_u16(dst + 8 * stride + 0, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 8 * stride + 8, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 8 * stride + 16, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 8 * stride + 24, vextq_u16(d1[0], d1[1], 7)); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(d1[0], d1[1], 5)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(d1[0], d1[1], 3)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(d20[7], d1[0], 1)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(d1[0], d1[1], 1)); + + vst1q_u16(dst + 12 * stride + 0, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 12 * stride + 8, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 12 * stride + 16, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 12 * stride + 24, vextq_u16(d20[7], d1[0], 7)); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(d20[7], d1[0], 5)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(d20[7], d1[0], 3)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(d20[6], d20[7], 1)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(d20[7], d1[0], 1)); + + vst1q_u16(dst + 16 * stride + 0, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 16 * stride + 8, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 16 * stride + 16, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 16 * stride + 24, vextq_u16(d20[6], d20[7], 7)); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(d20[6], d20[7], 5)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(d20[6], d20[7], 3)); + vst1q_u16(dst + 19 * stride + 0, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 19 * stride + 
16, vextq_u16(d20[5], d20[6], 1)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(d20[6], d20[7], 1)); + + vst1q_u16(dst + 20 * stride + 0, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 20 * stride + 8, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 20 * stride + 16, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 20 * stride + 24, vextq_u16(d20[5], d20[6], 7)); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(d20[5], d20[6], 5)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(d20[5], d20[6], 3)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(d20[4], d20[5], 1)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(d20[5], d20[6], 1)); + + vst1q_u16(dst + 24 * stride + 0, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 24 * stride + 8, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 24 * stride + 16, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 24 * stride + 24, vextq_u16(d20[4], d20[5], 7)); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(d20[4], d20[5], 5)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(d20[4], d20[5], 3)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(d20[3], d20[4], 1)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(d20[4], d20[5], 1)); + + vst1q_u16(dst + 28 * stride + 0, vextq_u16(d20[0], d20[1], 7)); + vst1q_u16(dst + 28 * stride + 8, vextq_u16(d20[1], d20[2], 7)); + vst1q_u16(dst + 28 * stride + 16, vextq_u16(d20[2], d20[3], 7)); + vst1q_u16(dst + 28 * stride + 24, vextq_u16(d20[3], d20[4], 7)); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(d20[0], d20[1], 5)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(d20[1], d20[2], 5)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(d20[2], d20[3], 5)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(d20[3], d20[4], 5)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(d20[0], d20[1], 3)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(d20[1], d20[2], 3)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(d20[2], d20[3], 3)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(d20[3], d20[4], 3)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(d20[0], d20[1], 1)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(d20[1], d20[2], 1)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(d20[2], d20[3], 1)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(d20[3], d20[4], 1)); } // ----------------------------------------------------------------------------- @@ -696,6 +1821,311 @@ void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, 
//------------------------------------------------------------------------------ +void vpx_highbd_d207_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x4_t l0, l1, l2, l3, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1_u16(left + 0); + l3 = vld1_dup_u16(left + 3); + + // [ left[1], left[2], left[3], left[3] ] + l1 = vext_u16(l0, l3, 1); + // [ left[2], left[3], left[3], left[3] ] + l2 = vext_u16(l0, l3, 2); + + c0 = vrhadd_u16(l0, l1); + c1 = vrhadd_u16(vhadd_u16(l0, l2), l1); + + c01_lo = vzip_u16(c0, c1).val[0]; + c01_hi = vzip_u16(c0, c1).val[1]; + + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + vst1_u16(dst + 0 * stride, c01_lo); + vst1_u16(dst + 1 * stride, vext_u16(c01_lo, c01_hi, 2)); + vst1_u16(dst + 2 * stride, c01_hi); + vst1_u16(dst + 3 * stride, vext_u16(c01_hi, l3, 2)); +} + +void vpx_highbd_d207_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l7, c0, c1, c01_lo, c01_hi; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l7 = vld1q_dup_u16(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vextq_u16(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vextq_u16(l0, l7, 2); + + c0 = vrhaddq_u16(l0, l1); + c1 = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + + c01_lo = vzipq_u16(c0, c1).val[0]; + c01_hi = vzipq_u16(c0, c1).val[1]; + + vst1q_u16(dst + 0 * stride, c01_lo); + vst1q_u16(dst + 1 * stride, vextq_u16(c01_lo, c01_hi, 2)); + vst1q_u16(dst + 2 * stride, vextq_u16(c01_lo, c01_hi, 4)); + vst1q_u16(dst + 3 * stride, vextq_u16(c01_lo, c01_hi, 6)); + vst1q_u16(dst + 4 * stride, c01_hi); + vst1q_u16(dst + 5 * stride, vextq_u16(c01_hi, l7, 2)); + vst1q_u16(dst + 6 * stride, vextq_u16(c01_hi, l7, 4)); + vst1q_u16(dst + 7 * stride, vextq_u16(c01_hi, l7, 6)); +} + +void vpx_highbd_d207_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l15, c0[2], c1[2], c01[4]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l15 = vld1q_dup_u16(left + 15); + + l9 = vextq_u16(l8, l15, 1); + l10 = vextq_u16(l8, l15, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, 
vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], l15, 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, l15); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], l15, 2)); + vst1q_u16(dst + 13 * stride + 8, l15); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], l15, 4)); + vst1q_u16(dst + 14 * stride + 8, l15); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], l15, 6)); + vst1q_u16(dst + 15 * stride + 8, l15); +} + +void vpx_highbd_d207_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + uint16x8_t l0, l1, l2, l8, l9, l10, l16, l17, l18, l24, l25, l26, l31, c0[4], + c1[4], c01[8]; + (void)above; + (void)bd; + + l0 = vld1q_u16(left + 0); + l1 = vld1q_u16(left + 1); + l2 = vld1q_u16(left + 2); + l8 = vld1q_u16(left + 8); + l9 = vld1q_u16(left + 9); + l10 = vld1q_u16(left + 10); + l16 = vld1q_u16(left + 16); + l17 = vld1q_u16(left + 17); + l18 = vld1q_u16(left + 18); + l24 = vld1q_u16(left + 24); + l31 = vld1q_dup_u16(left + 31); + + l25 = vextq_u16(l24, l31, 1); + l26 = vextq_u16(l24, l31, 2); + + c0[0] = vrhaddq_u16(l0, l1); + c0[1] = vrhaddq_u16(l8, l9); + c0[2] = vrhaddq_u16(l16, l17); + c0[3] = vrhaddq_u16(l24, l25); + c1[0] = vrhaddq_u16(vhaddq_u16(l0, l2), l1); + c1[1] = vrhaddq_u16(vhaddq_u16(l8, l10), l9); + c1[2] = vrhaddq_u16(vhaddq_u16(l16, l18), l17); + c1[3] = vrhaddq_u16(vhaddq_u16(l24, l26), l25); + + c01[0] = vzipq_u16(c0[0], c1[0]).val[0]; + c01[1] = vzipq_u16(c0[0], c1[0]).val[1]; + c01[2] = vzipq_u16(c0[1], c1[1]).val[0]; + c01[3] = vzipq_u16(c0[1], c1[1]).val[1]; + c01[4] = vzipq_u16(c0[2], c1[2]).val[0]; + c01[5] = vzipq_u16(c0[2], c1[2]).val[1]; + c01[6] = vzipq_u16(c0[3], c1[3]).val[0]; + c01[7] = vzipq_u16(c0[3], c1[3]).val[1]; + + vst1q_u16(dst + 0 * stride + 0, c01[0]); + vst1q_u16(dst + 0 * stride + 8, c01[1]); + vst1q_u16(dst + 0 * stride + 16, c01[2]); + vst1q_u16(dst + 0 * stride + 24, c01[3]); + vst1q_u16(dst + 1 * stride + 0, vextq_u16(c01[0], c01[1], 2)); + vst1q_u16(dst + 1 * stride + 8, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 1 * stride + 16, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 1 * stride + 24, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 2 * stride + 0, vextq_u16(c01[0], c01[1], 4)); + vst1q_u16(dst + 2 * stride + 8, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 2 * stride + 16, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 2 * stride + 24, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 3 * stride + 0, vextq_u16(c01[0], c01[1], 6)); + vst1q_u16(dst + 3 * stride + 8, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 3 * stride + 16, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 3 * stride + 24, vextq_u16(c01[3], c01[4], 6)); + + vst1q_u16(dst + 4 * stride + 0, c01[1]); + vst1q_u16(dst + 4 * stride + 8, c01[2]); + 
vst1q_u16(dst + 4 * stride + 16, c01[3]); + vst1q_u16(dst + 4 * stride + 24, c01[4]); + vst1q_u16(dst + 5 * stride + 0, vextq_u16(c01[1], c01[2], 2)); + vst1q_u16(dst + 5 * stride + 8, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 5 * stride + 16, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 5 * stride + 24, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 6 * stride + 0, vextq_u16(c01[1], c01[2], 4)); + vst1q_u16(dst + 6 * stride + 8, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 6 * stride + 16, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 6 * stride + 24, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 7 * stride + 0, vextq_u16(c01[1], c01[2], 6)); + vst1q_u16(dst + 7 * stride + 8, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 7 * stride + 16, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 7 * stride + 24, vextq_u16(c01[4], c01[5], 6)); + + vst1q_u16(dst + 8 * stride + 0, c01[2]); + vst1q_u16(dst + 8 * stride + 8, c01[3]); + vst1q_u16(dst + 8 * stride + 16, c01[4]); + vst1q_u16(dst + 8 * stride + 24, c01[5]); + vst1q_u16(dst + 9 * stride + 0, vextq_u16(c01[2], c01[3], 2)); + vst1q_u16(dst + 9 * stride + 8, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 9 * stride + 16, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 9 * stride + 24, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 10 * stride + 0, vextq_u16(c01[2], c01[3], 4)); + vst1q_u16(dst + 10 * stride + 8, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 10 * stride + 16, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 10 * stride + 24, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 11 * stride + 0, vextq_u16(c01[2], c01[3], 6)); + vst1q_u16(dst + 11 * stride + 8, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 11 * stride + 16, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 11 * stride + 24, vextq_u16(c01[5], c01[6], 6)); + + vst1q_u16(dst + 12 * stride + 0, c01[3]); + vst1q_u16(dst + 12 * stride + 8, c01[4]); + vst1q_u16(dst + 12 * stride + 16, c01[5]); + vst1q_u16(dst + 12 * stride + 24, c01[6]); + vst1q_u16(dst + 13 * stride + 0, vextq_u16(c01[3], c01[4], 2)); + vst1q_u16(dst + 13 * stride + 8, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 13 * stride + 16, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 13 * stride + 24, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 14 * stride + 0, vextq_u16(c01[3], c01[4], 4)); + vst1q_u16(dst + 14 * stride + 8, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 14 * stride + 16, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 14 * stride + 24, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 15 * stride + 0, vextq_u16(c01[3], c01[4], 6)); + vst1q_u16(dst + 15 * stride + 8, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 15 * stride + 16, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 15 * stride + 24, vextq_u16(c01[6], c01[7], 6)); + + vst1q_u16(dst + 16 * stride + 0, c01[4]); + vst1q_u16(dst + 16 * stride + 8, c01[5]); + vst1q_u16(dst + 16 * stride + 16, c01[6]); + vst1q_u16(dst + 16 * stride + 24, c01[7]); + vst1q_u16(dst + 17 * stride + 0, vextq_u16(c01[4], c01[5], 2)); + vst1q_u16(dst + 17 * stride + 8, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 17 * stride + 16, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 17 * stride + 24, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 18 * stride + 0, vextq_u16(c01[4], c01[5], 4)); + vst1q_u16(dst + 18 * stride + 8, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 18 * stride + 16, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 18 * stride + 24, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 19 * 
stride + 0, vextq_u16(c01[4], c01[5], 6)); + vst1q_u16(dst + 19 * stride + 8, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 19 * stride + 16, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 19 * stride + 24, vextq_u16(c01[7], l31, 6)); + + vst1q_u16(dst + 20 * stride + 0, c01[5]); + vst1q_u16(dst + 20 * stride + 8, c01[6]); + vst1q_u16(dst + 20 * stride + 16, c01[7]); + vst1q_u16(dst + 20 * stride + 24, l31); + vst1q_u16(dst + 21 * stride + 0, vextq_u16(c01[5], c01[6], 2)); + vst1q_u16(dst + 21 * stride + 8, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 21 * stride + 16, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 21 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 22 * stride + 0, vextq_u16(c01[5], c01[6], 4)); + vst1q_u16(dst + 22 * stride + 8, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 22 * stride + 16, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 22 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 23 * stride + 0, vextq_u16(c01[5], c01[6], 6)); + vst1q_u16(dst + 23 * stride + 8, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 23 * stride + 16, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 23 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 24 * stride + 0, c01[6]); + vst1q_u16(dst + 24 * stride + 8, c01[7]); + vst1q_u16(dst + 24 * stride + 16, l31); + vst1q_u16(dst + 24 * stride + 24, l31); + vst1q_u16(dst + 25 * stride + 0, vextq_u16(c01[6], c01[7], 2)); + vst1q_u16(dst + 25 * stride + 8, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 25 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 25 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 26 * stride + 0, vextq_u16(c01[6], c01[7], 4)); + vst1q_u16(dst + 26 * stride + 8, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 26 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 26 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 27 * stride + 0, vextq_u16(c01[6], c01[7], 6)); + vst1q_u16(dst + 27 * stride + 8, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 27 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 27 * stride + 24, vextq_u16(l31, l31, 6)); + + vst1q_u16(dst + 28 * stride + 0, c01[7]); + vst1q_u16(dst + 28 * stride + 8, l31); + vst1q_u16(dst + 28 * stride + 16, l31); + vst1q_u16(dst + 28 * stride + 24, l31); + vst1q_u16(dst + 29 * stride + 0, vextq_u16(c01[7], l31, 2)); + vst1q_u16(dst + 29 * stride + 8, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 16, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 29 * stride + 24, vextq_u16(l31, l31, 2)); + vst1q_u16(dst + 30 * stride + 0, vextq_u16(c01[7], l31, 4)); + vst1q_u16(dst + 30 * stride + 8, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 16, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 30 * stride + 24, vextq_u16(l31, l31, 4)); + vst1q_u16(dst + 31 * stride + 0, vextq_u16(c01[7], l31, 6)); + vst1q_u16(dst + 31 * stride + 8, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 16, vextq_u16(l31, l31, 6)); + vst1q_u16(dst + 31 * stride + 24, vextq_u16(l31, l31, 6)); +} + +//------------------------------------------------------------------------------ + void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { @@ -725,30 +2155,36 @@ void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row = vld2q_u16(above); + const uint16x8_t row0 = vld1q_u16(above + 0); + 
const uint16x8_t row1 = vld1q_u16(above + 8); int i; (void)left; (void)bd; - for (i = 0; i < 16; i++, dst += stride) { - vst2q_u16(dst, row); + for (i = 0; i < 16; i++) { + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + dst += stride; } } void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { - const uint16x8x2_t row0 = vld2q_u16(above); - const uint16x8x2_t row1 = vld2q_u16(above + 16); + const uint16x8_t row0 = vld1q_u16(above + 0); + const uint16x8_t row1 = vld1q_u16(above + 8); + const uint16x8_t row2 = vld1q_u16(above + 16); + const uint16x8_t row3 = vld1q_u16(above + 24); int i; (void)left; (void)bd; for (i = 0; i < 32; i++) { - vst2q_u16(dst, row0); - dst += 16; - vst2q_u16(dst, row1); - dst += stride - 16; + vst1q_u16(dst + 0, row0); + vst1q_u16(dst + 8, row1); + vst1q_u16(dst + 16, row2); + vst1q_u16(dst + 24, row3); + dst += stride; } } diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c index b9f72a94c..c2ad34a69 100644 --- a/vpx_dsp/arm/highbd_quantize_neon.c +++ b/vpx_dsp/arm/highbd_quantize_neon.c @@ -13,6 +13,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store( const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1, @@ -94,26 +96,25 @@ highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr)); - int32x4_t round = vmovl_s16(vld1_s16(round_ptr)); + int32x4_t zbin = vmovl_s16(vld1_s16(mb_plane->zbin)); + int32x4_t round = vmovl_s16(vld1_s16(mb_plane->round)); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 15); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. 
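+ + // Note on the quant/quant_shift setup above: vqdmulhq_s32(x, q << 15) + // computes (2 * x * (q << 15)) >> 32 == (x * q) >> 16, so the pre-shifted + // vectors reproduce the scalar quantizer's ((tmp * quant) >> 16) stages + // without needing a widening multiply; the 32x32 variant below shifts + // quant_shift by 16 to implement its final >> 15 instead.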
@@ -164,7 +165,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } while (n_coeffs > 0); } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -174,11 +175,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)n_coeffs; - (void)scan; +#endif // VPX_ARCH_AARCH64 } static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -224,25 +221,25 @@ static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon( } void vpx_highbd_quantize_b_32x32_neon( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. // High half has identical elements, but we can reconstruct it from the low // half by duplicating the 2nd element. So we only need to pass a 4x32-bit // vector - int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1); - int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1); + int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->zbin)), 1); + int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(mb_plane->round)), 1); // Extend the quant, quant_shift vectors to ones of 32-bit elements // scale to high-half, so we can use vqdmulhq_s32 - int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15); - int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16); + int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant)), 15); + int32x4_t quant_shift = + vshlq_n_s32(vmovl_s16(vld1_s16(mb_plane->quant_shift)), 16); int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr)); // Process first 8 values which include a dc component. @@ -289,7 +286,7 @@ void vpx_highbd_quantize_b_32x32_neon( } } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -299,9 +296,5 @@ void vpx_highbd_quantize_b_32x32_neon( const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)n_coeffs; - (void)scan; +#endif // VPX_ARCH_AARCH64 } diff --git a/vpx_dsp/arm/highbd_sad4d_neon.c b/vpx_dsp/arm/highbd_sad4d_neon.c new file mode 100644 index 000000000..a6684b053 --- /dev/null +++ b/vpx_dsp/arm/highbd_sad4d_neon.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sad4xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + uint16x4_t s = vld1_u16(src16_ptr + i * src_stride); + uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride); + uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride); + uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride); + uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride); + + sum[0] = vabal_u16(sum[0], s, r0); + sum[1] = vabal_u16(sum[1], s, r1); + sum[2] = vabal_u16(sum[2], s, r2); + sum[3] = vabal_u16(sum[3], s, r3); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad8xhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32[4]; + + int i = 0; + do { + uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride); + + sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride)); + sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride)); + sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride)); + sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride)); + + } while (++i < h); + + sum_u32[0] = vpaddlq_u16(sum[0]); + sum_u32[1] = vpaddlq_u16(sum[1]); + sum_u32[2] = vpaddlq_u16(sum[2]); + sum_u32[3] = vpaddlq_u16(sum[3]); + vst1q_u32(res, horizontal_add_4d_uint32x4(sum_u32)); +} + +static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref, + uint32x4_t *const sad_sum) { + uint16x8_t abs_diff = vabdq_u16(src, ref); + *sad_sum = vpadalq_u16(*sad_sum, abs_diff); +} + +static INLINE void highbd_sad16xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint16x8_t s0, s1; + + s0 = 
vld1q_u16(src16_ptr + i * src_stride); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sadwxhx4d_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], int w, + int h) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]); + const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]); + const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]); + const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]); + + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3; + + s0 = vld1q_u16(src16_ptr + i * src_stride + j); + sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]); + sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]); + sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]); + sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]); + + s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8); + sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]); + sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]); + sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]); + sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]); + + s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16); + sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16), + &sum_lo[0]); + sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16), + &sum_lo[1]); + sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16), + &sum_lo[2]); + sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16), + &sum_lo[3]); + + s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24); + sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24), + &sum_hi[0]); + sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24), + &sum_hi[1]); + sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24), + &sum_hi[2]); + sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24), + &sum_hi[3]); + + j += 32; + } while (j < w); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void highbd_sad64xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const 
ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64, h); +} + +static INLINE void highbd_sad32xhx4d_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *const ref_ptr[4], + int ref_stride, uint32_t res[4], + int h) { + highbd_sadwxhx4d_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32, h); +} + +#define HBD_SAD_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +HBD_SAD_WXH_4D_NEON(4, 4) +HBD_SAD_WXH_4D_NEON(4, 8) + +HBD_SAD_WXH_4D_NEON(8, 4) +HBD_SAD_WXH_4D_NEON(8, 8) +HBD_SAD_WXH_4D_NEON(8, 16) + +HBD_SAD_WXH_4D_NEON(16, 8) +HBD_SAD_WXH_4D_NEON(16, 16) +HBD_SAD_WXH_4D_NEON(16, 32) + +HBD_SAD_WXH_4D_NEON(32, 16) +HBD_SAD_WXH_4D_NEON(32, 32) +HBD_SAD_WXH_4D_NEON(32, 64) + +HBD_SAD_WXH_4D_NEON(64, 32) +HBD_SAD_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_WXH_4D_NEON + +#define HBD_SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_highbd_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad##w##xhx4d_neon(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +HBD_SAD_SKIP_WXH_4D_NEON(4, 4) +HBD_SAD_SKIP_WXH_4D_NEON(4, 8) + +HBD_SAD_SKIP_WXH_4D_NEON(8, 4) +HBD_SAD_SKIP_WXH_4D_NEON(8, 8) +HBD_SAD_SKIP_WXH_4D_NEON(8, 16) + +HBD_SAD_SKIP_WXH_4D_NEON(16, 8) +HBD_SAD_SKIP_WXH_4D_NEON(16, 16) +HBD_SAD_SKIP_WXH_4D_NEON(16, 32) + +HBD_SAD_SKIP_WXH_4D_NEON(32, 16) +HBD_SAD_SKIP_WXH_4D_NEON(32, 32) +HBD_SAD_SKIP_WXH_4D_NEON(32, 64) + +HBD_SAD_SKIP_WXH_4D_NEON(64, 32) +HBD_SAD_SKIP_WXH_4D_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_4D_NEON diff --git a/vpx_dsp/arm/highbd_sad_neon.c b/vpx_dsp/arm/highbd_sad_neon.c index ecb52ce5a..b99bac66c 100644 --- a/vpx_dsp/arm/highbd_sad_neon.c +++ b/vpx_dsp/arm/highbd_sad_neon.c @@ -17,209 +17,392 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad4xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t src_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j); - sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16); - } + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + sum = vabal_u16(sum, s, r); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, int width, - int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t 
highbd_sad8xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16)); - sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16), - vget_high_u16(ref_u16)); - } + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + sum = vabaq_u16(sum, s, r); + src16_ptr += src_stride; ref16_ptr += ref_stride; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint16x8(sum); } -static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sad16xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 4) { - const uint16x4_t a_u16 = vld1_u16(src16_ptr + j); - const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j); - const uint16x4_t c_u16 = vld1_u16(pred_ptr + j); - const uint16x4_t avg = vrhadd_u16(b_u16, c_u16); - sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg); - } + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint16x8_t diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + } while (--i != 0); - return horizontal_add_uint32x4(sum_abs_diff); + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); } -static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon( - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, - int ref_stride, const uint8_t *second_pred, int width, int height) { - int i, j; - uint32x4_t sum_abs_diff = vdupq_n_u32(0); +static INLINE uint32_t highbd_sadwxh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); - const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred); - for (i = 0; i < height; i++) { - for (j = 0; j < width; j += 8) { - const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j); - const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j); - const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j); - const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg)); - sum_abs_diff = - vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg)); - } + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + 
vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3; + uint16x8_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + diff0 = vabdq_u16(s0, r0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + diff1 = vabdq_u16(s1, r1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + diff2 = vabdq_u16(s2, r2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + diff3 = vabdq_u16(s3, r3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + src16_ptr += src_stride; ref16_ptr += ref_stride; - pred_ptr += width; - } + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); - return horizontal_add_uint32x4(sum_abs_diff); + return horizontal_add_uint32x4(sum[0]); } -#define highbd_sad4MxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ - } +static INLINE unsigned int highbd_sad64xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} -#define highbd_sadMxN(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ - } +static INLINE unsigned int highbd_sad32xh_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return highbd_sadwxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} -#define highbd_sad4MxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ +#define HBD_SAD_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } -#define highbd_sadMxN_avg(m, n) \ - unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, const uint8_t *second_pred) { \ - return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \ - second_pred, m, n); \ +HBD_SAD_WXH_NEON(4, 4) +HBD_SAD_WXH_NEON(4, 8) + +HBD_SAD_WXH_NEON(8, 4) +HBD_SAD_WXH_NEON(8, 8) +HBD_SAD_WXH_NEON(8, 16) + +HBD_SAD_WXH_NEON(16, 8) +HBD_SAD_WXH_NEON(16, 16) +HBD_SAD_WXH_NEON(16, 32) + +HBD_SAD_WXH_NEON(32, 16) +HBD_SAD_WXH_NEON(32, 32) +HBD_SAD_WXH_NEON(32, 64) + +HBD_SAD_WXH_NEON(64, 32) +HBD_SAD_WXH_NEON(64, 64) + +#undef HBD_SAD_WXH_NEON + +#define HBD_SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_highbd_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad##w##xh_neon(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ } -#define highbd_sadMxNx4D(m, n) \ - void 
vpx_highbd_sad##m##x##n##x4d_neon( \ - const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[4], int ref_stride, \ - uint32_t sad_array[4]) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \ - ref_array[i], ref_stride); \ - } \ +HBD_SAD_SKIP_WXH_NEON(4, 4) +HBD_SAD_SKIP_WXH_NEON(4, 8) + +HBD_SAD_SKIP_WXH_NEON(8, 4) +HBD_SAD_SKIP_WXH_NEON(8, 8) +HBD_SAD_SKIP_WXH_NEON(8, 16) + +HBD_SAD_SKIP_WXH_NEON(16, 8) +HBD_SAD_SKIP_WXH_NEON(16, 16) +HBD_SAD_SKIP_WXH_NEON(16, 32) + +HBD_SAD_SKIP_WXH_NEON(32, 16) +HBD_SAD_SKIP_WXH_NEON(32, 32) +HBD_SAD_SKIP_WXH_NEON(32, 64) + +HBD_SAD_SKIP_WXH_NEON(64, 32) +HBD_SAD_SKIP_WXH_NEON(64, 64) + +#undef HBD_SAD_SKIP_WXH_NEON + +static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x4_t s = vld1_u16(src16_ptr); + uint16x4_t r = vld1_u16(ref16_ptr); + uint16x4_t p = vld1_u16(pred16_ptr); + + uint16x4_t avg = vrhadd_u16(r, p); + sum = vabal_u16(sum, s, avg); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 4; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s = vld1q_u16(src16_ptr); + uint16x8_t r = vld1q_u16(ref16_ptr); + uint16x8_t p = vld1q_u16(pred16_ptr); + + uint16x8_t avg = vrhaddq_u16(r, p); + uint16x8_t diff = vabdq_u16(s, avg); + sum = vpadalq_u16(sum, diff); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 8; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); +} + +static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + uint16x8_t s0, s1, r0, r1, p0, p1; + uint16x8_t avg0, avg1, diff0, diff1; + + s0 = vld1q_u16(src16_ptr); + r0 = vld1q_u16(ref16_ptr); + p0 = vld1q_u16(pred16_ptr); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + 8); + r1 = vld1q_u16(ref16_ptr + 8); + p1 = vld1q_u16(pred16_ptr + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += 16; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + 
const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr); + const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr); + const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred); + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u16(src16_ptr + j); + r0 = vld1q_u16(ref16_ptr + j); + p0 = vld1q_u16(pred16_ptr + j); + avg0 = vrhaddq_u16(r0, p0); + diff0 = vabdq_u16(s0, avg0); + sum[0] = vpadalq_u16(sum[0], diff0); + + s1 = vld1q_u16(src16_ptr + j + 8); + r1 = vld1q_u16(ref16_ptr + j + 8); + p1 = vld1q_u16(pred16_ptr + j + 8); + avg1 = vrhaddq_u16(r1, p1); + diff1 = vabdq_u16(s1, avg1); + sum[1] = vpadalq_u16(sum[1], diff1); + + s2 = vld1q_u16(src16_ptr + j + 16); + r2 = vld1q_u16(ref16_ptr + j + 16); + p2 = vld1q_u16(pred16_ptr + j + 16); + avg2 = vrhaddq_u16(r2, p2); + diff2 = vabdq_u16(s2, avg2); + sum[2] = vpadalq_u16(sum[2], diff2); + + s3 = vld1q_u16(src16_ptr + j + 24); + r3 = vld1q_u16(ref16_ptr + j + 24); + p3 = vld1q_u16(pred16_ptr + j + 24); + avg3 = vrhaddq_u16(r3, p3); + diff3 = vabdq_u16(s3, avg3); + sum[3] = vpadalq_u16(sum[3], diff3); + + j += 32; + } while (j < w); + + src16_ptr += src_stride; + ref16_ptr += ref_stride; + pred16_ptr += w; + } while (--i != 0); + + sum[0] = vaddq_u32(sum[0], sum[1]); + sum[2] = vaddq_u32(sum[2], sum[3]); + sum[0] = vaddq_u32(sum[0], sum[2]); + + return horizontal_add_uint32x4(sum[0]); +} + +static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h, + second_pred); +} + +static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h, + second_pred); +} + +#define HBD_SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_highbd_sad##w##x##h##_avg_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } -/* clang-format off */ -// 4x4 -highbd_sad4MxN(4, 4) -highbd_sad4MxN_avg(4, 4) -highbd_sadMxNx4D(4, 4) - -// 4x8 -highbd_sad4MxN(4, 8) -highbd_sad4MxN_avg(4, 8) -highbd_sadMxNx4D(4, 8) - -// 8x4 -highbd_sadMxN(8, 4) -highbd_sadMxN_avg(8, 4) -highbd_sadMxNx4D(8, 4) - -// 8x8 -highbd_sadMxN(8, 8) -highbd_sadMxN_avg(8, 8) -highbd_sadMxNx4D(8, 8) - -// 8x16 -highbd_sadMxN(8, 16) -highbd_sadMxN_avg(8, 16) -highbd_sadMxNx4D(8, 16) - -// 16x8 -highbd_sadMxN(16, 8) -highbd_sadMxN_avg(16, 8) -highbd_sadMxNx4D(16, 8) - -// 16x16 -highbd_sadMxN(16, 16) -highbd_sadMxN_avg(16, 16) -highbd_sadMxNx4D(16, 16) - -// 16x32 -highbd_sadMxN(16, 32) -highbd_sadMxN_avg(16, 32) -highbd_sadMxNx4D(16, 32) - -// 32x16 -highbd_sadMxN(32, 16) -highbd_sadMxN_avg(32, 16) -highbd_sadMxNx4D(32, 16) - -// 32x32 -highbd_sadMxN(32, 32) -highbd_sadMxN_avg(32, 32) -highbd_sadMxNx4D(32, 32) - -// 32x64 -highbd_sadMxN(32, 64) -highbd_sadMxN_avg(32, 64) -highbd_sadMxNx4D(32, 64) - -// 64x32 -highbd_sadMxN(64, 32) -highbd_sadMxN_avg(64, 32) -highbd_sadMxNx4D(64, 32) - -// 64x64 -highbd_sadMxN(64, 64) -highbd_sadMxN_avg(64, 64) 
-highbd_sadMxNx4D(64, 64) - /* clang-format on */ +HBD_SAD_WXH_AVG_NEON(4, 4) +HBD_SAD_WXH_AVG_NEON(4, 8) + +HBD_SAD_WXH_AVG_NEON(8, 4) +HBD_SAD_WXH_AVG_NEON(8, 8) +HBD_SAD_WXH_AVG_NEON(8, 16) + +HBD_SAD_WXH_AVG_NEON(16, 8) +HBD_SAD_WXH_AVG_NEON(16, 16) +HBD_SAD_WXH_AVG_NEON(16, 32) + +HBD_SAD_WXH_AVG_NEON(32, 16) +HBD_SAD_WXH_AVG_NEON(32, 32) +HBD_SAD_WXH_AVG_NEON(32, 64) + +HBD_SAD_WXH_AVG_NEON(64, 32) +HBD_SAD_WXH_AVG_NEON(64, 64) diff --git a/vpx_dsp/arm/highbd_sse_neon.c b/vpx_dsp/arm/highbd_sse_neon.c new file mode 100644 index 000000000..ee76bed58 --- /dev/null +++ b/vpx_dsp/arm/highbd_sse_neon.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src, + const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi); +} + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint32x4_t *sse_acc0, + uint32x4_t *sse_acc1) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + uint16x4_t abs_diff_lo = vget_low_u16(abs_diff); + uint16x4_t abs_diff_hi = vget_high_u16(abs_diff); + + *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo); + *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi); +} + +static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_32xh_neon(const uint16_t 
*src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[8]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x8(sse); +} + +static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[4]; + highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x4(sse); +} + +static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2]; + highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4_x2(sse); +} + +static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + // Peel the first loop iteration. + uint16x4_t s = vld1_u16(src); + uint16x4_t r = vld1_u16(ref); + + uint16x4_t abs_diff = vabd_u16(s, r); + uint32x4_t sse = vmull_u16(abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + + while (--height != 0) { + s = vld1_u16(src); + r = vld1_u16(ref); + + abs_diff = vabd_u16(s, r); + sse = vmlal_u16(sse, abs_diff, abs_diff); + + src += src_stride; + ref += ref_stride; + } + + return horizontal_long_add_uint32x4(sse); +} + +static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + // { 0, 1, 2, 3, 4, 5, 6, 7 } + uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100)); + uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7)); + uint64_t sse = 0; + + do { + int w = width; + int offset = 0; + + do { + uint16x8_t s = vld1q_u16(src + offset); + uint16x8_t r = vld1q_u16(ref + offset); + uint16x8_t abs_diff; + uint16x4_t abs_diff_lo; + uint16x4_t abs_diff_hi; + uint32x4_t sse_u32; + + if (w < 8) { + // Mask out-of-range elements. 
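// (k01234567 holds the lane indices 0, 1, ..., 7, so the vclt against
// width & 7 yields an all-ones lane exactly for the in-range positions;
// masked-off lanes become zero in both src and ref and therefore
// contribute nothing to the sum of squares.)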
+ s = vandq_u16(s, remainder_mask); + r = vandq_u16(r, remainder_mask); + } + + abs_diff = vabdq_u16(s, r); + abs_diff_lo = vget_low_u16(abs_diff); + abs_diff_hi = vget_high_u16(abs_diff); + + sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo); + sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi); + + sse += horizontal_long_add_uint32x4(sse_u32); + + offset += 8; + w -= 8; + } while (w > 0); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return sse; +} + +int64_t vpx_highbd_sse_neon(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: + return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: + return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/vpx_dsp/arm/highbd_subpel_variance_neon.c b/vpx_dsp/arm/highbd_subpel_variance_neon.c new file mode 100644 index 000000000..683df5797 --- /dev/null +++ b/vpx_dsp/arm/highbd_subpel_variance_neon.c @@ -0,0 +1,586 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" + +// The bilinear filters look like this: +// +// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, +// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }} +// +// We can factor out the highest common multiple, such that the sum of both +// weights will be 8 instead of 128. The benefits of this are two-fold: +// +// 1) We can infer the filter values from the filter_offset parameter in the +// bilinear filter functions below - we don't have to actually load the values +// from memory: +// f0 = 8 - filter_offset +// f1 = filter_offset +// +// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on +// 16-bit data types at all times, rather than widening out to 32-bit and +// requiring double the number of data processing instructions. (12-bit * 8 = +// 15-bit.) + +// Process a block exactly 4 wide and any height. 
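For reference, the arithmetic shared by every bilinear kernel below is compact enough to state in scalar C. This is an illustrative sketch only -- the function name is invented here and is not part of the change -- assuming f0 + f1 == 8 as derived in the comment above:

#include <stdint.h>

// Scalar model of one bilinear output pixel (illustrative, not from the diff).
static uint16_t highbd_bilinear_blend_scalar(uint16_t s0, uint16_t s1,
                                             int filter_offset) {
  const uint32_t f0 = (uint32_t)(8 - filter_offset);
  const uint32_t f1 = (uint32_t)filter_offset;
  // vrshr_n_u16(x, 3) is a rounding shift, i.e. (x + 4) >> 3. With 12-bit
  // input the blend peaks at 4095 * 8 = 32760, so 16-bit lanes never
  // overflow.
  return (uint16_t)((s0 * f0 + s1 * f1 + 4) >> 3);
}

The NEON kernels that follow vectorize exactly this computation, four or eight pixels at a time.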
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, blend); + + src_ptr += src_stride; + dst_ptr += 4; + } while (--i != 0); +} + +// Process a block which is a multiple of 8 and any height. +static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr, + uint16_t *dst_ptr, + int src_stride, int pixel_step, + int dst_width, int dst_height, + int filter_offset) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, blend); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 8, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 16, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 32, dst_height, filter_offset); +} +static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_height, + int filter_offset) { + highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, + 64, dst_height, filter_offset); +} + +static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr, + uint16_t *dst_ptr, int src_stride, + int pixel_step, int dst_width, + int dst_height) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
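// (This helper is reached only via the HBD_SPECIALIZED_* macros further
// down, which are instantiated solely for block widths of 16 and above --
// hence the assertion below.)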
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + vst1q_u16(dst_ptr + j, avg); + + j += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ + h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ + src_stride, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ + (h + 1)); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ + xoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + 
uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ + (h + 1), xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64) + +// 10-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64) + +// 12-bit +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64) + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for blocks having +// width 4. +static void highbd_avg_pred_var_filter_block2d_bil_w4( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + const uint16x4_t f0 = vdup_n_u16(8 - filter_offset); + const uint16x4_t f1 = vdup_n_u16(filter_offset); + + int i = dst_height; + do { + uint16x4_t s0 = load_unaligned_u16(src_ptr); + uint16x4_t s1 = load_unaligned_u16(src_ptr + pixel_step); + uint16x4_t p = vld1_u16(second_pred); + + uint16x4_t blend = vmul_u16(s0, f0); + blend = vmla_u16(blend, s1, f1); + blend = vrshr_n_u16(blend, 3); + + vst1_u16(dst_ptr, vrhadd_u16(blend, p)); + + src_ptr += src_stride; + dst_ptr += 4; + second_pred += 4; + } while (--i != 0); +} + +// Combine bilinear filter with vpx_highbd_comp_avg_pred for large blocks. 
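// As in the width-4 version above, the second prediction is folded into the
// filter pass with a single rounding halving add: vrhaddq_u16(blend, p)
// computes (blend + p + 1) >> 1, the same rounding average that
// vpx_highbd_comp_avg_pred applies, so no separate averaging pass over the
// block is needed.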
+static void highbd_avg_pred_var_filter_block2d_bil_large( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, int filter_offset, + const uint16_t *second_pred) { + const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset); + const uint16x8_t f1 = vdupq_n_u16(filter_offset); + + int i = dst_height; + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t blend = vmulq_u16(s0, f0); + blend = vmlaq_u16(blend, s1, f1); + blend = vrshrq_n_u16(blend, 3); + + vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p)); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +static void highbd_avg_pred_var_filter_block2d_bil_w8( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 8, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w16( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 16, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w32( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 32, dst_height, + filter_offset, second_pred); +} +static void highbd_avg_pred_var_filter_block2d_bil_w64( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_height, int filter_offset, const uint16_t *second_pred) { + highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, + pixel_step, 64, dst_height, + filter_offset, second_pred); +} + +// Combine averaging subpel filter with vpx_highbd_comp_avg_pred. +static void highbd_avg_pred_var_filter_block2d_avg( + const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step, + int dst_width, int dst_height, const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) + assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s0 = vld1q_u16(src_ptr + j); + uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step); + uint16x8_t avg = vrhaddq_u16(s0, s1); + + uint16x8_t p = vld1q_u16(second_pred); + avg = vrhaddq_u16(avg, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +// Implementation of vpx_highbd_comp_avg_pred for blocks having width >= 16. +static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr, + int src_stride, int dst_width, int dst_height, + const uint16_t *second_pred) { + int i = dst_height; + + // We only specialize on the filter values for large block sizes (>= 16x16.) 
+ assert(dst_width >= 16 && dst_width % 16 == 0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t p = vld1q_u16(second_pred); + + uint16x8_t avg = vrhaddq_u16(s, p); + + vst1q_u16(dst_ptr + j, avg); + + j += 8; + second_pred += 8; + } while (j < dst_width); + + src_ptr += src_stride; + dst_ptr += dst_width; + } while (--i != 0); +} + +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } + +#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int vpx_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int source_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse, \ + const uint8_t *second_pred) { \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + if (xoffset == 0) { \ + uint16_t tmp[w * h]; \ + if (yoffset == 0) { \ + highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp, source_stride, source_stride, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } else { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp, source_stride, source_stride, h, yoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_avg( \ + src_ptr, tmp0, source_stride, 1, w, h, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * (h + 1)]; \ + highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \ + (h + 1)); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } else { \ + uint16_t tmp0[w * (h + 1)]; \ + if (yoffset == 0) { \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + src_ptr, tmp0, source_stride, 1, 
h, xoffset, \ + CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_avg( \ + tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } else { \ + uint16_t tmp1[w * h]; \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \ + (h + 1), xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + return vpx_highbd_##bitdepth##_variance##w##x##h##_neon( \ + CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ + } \ + } \ + } + +// 8-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64) + +// 10-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64) + +// 12-bit +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8) + +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8) +HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64) + +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32) +HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64) diff --git a/vpx_dsp/arm/highbd_variance_neon.c b/vpx_dsp/arm/highbd_variance_neon.c index 96a35af01..309ae7fd3 100644 --- a/vpx_dsp/arm/highbd_variance_neon.c +++ b/vpx_dsp/arm/highbd_variance_neon.c @@ -18,479 +18,419 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -static const uint8_t bilinear_filters[8][2] = { - { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }, -}; - -static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride, - 
const uint16_t *ref_ptr, int ref_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { - int i, j; - - if (w >= 8) { - int32x4_t sum_s32 = vdupq_n_s32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j])); - const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j])); - const int32x4_t diff1_s32 = - vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16)); - const int32x4_t diff2_s32 = - vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16)); - const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32); - const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32); - sum_s32 = vaddq_s32(sum_s32, diff1_s32); - sum_s32 = vaddq_s32(sum_s32, diff2_s32); - sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32); - sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32); - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_uint32x4(sse_u32); - } else { - int32x4_t sum_s32 = vdupq_n_s32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - assert(w >= 4); - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 4) { - const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j])); - const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j])); - const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16); - const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32); - sum_s32 = vaddq_s32(sum_s32, diff_s32); - sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32); - } - src_ptr += src_stride; - ref_ptr += ref_stride; - } - *sum = horizontal_add_int32x4(sum_s32); - *sse = horizontal_add_uint32x4(sse_u32); - } +// Process a block of width 4 two rows at a time. +static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int32x4_t sse_s32 = vdupq_n_s32(0); + + int i = h; + do { + const uint16x8_t s = load_unaligned_u16q(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16q(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff)); + sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int16x8(sum_s16); + *sse = horizontal_add_int32x4(sse_s32); } -static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint64_t *sse, - int64_t *sum) { - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr); - uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr); - - if (w < 32 && h < 32) { - highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum); - } else { - uint64_t sse_long = 0; - int64_t sum_long = 0; - int k, l; - for (k = 0; k + 16 <= h; k += 16) { - for (l = 0; l + 16 <= w; l += 16) { - uint64_t sse_tmp = 0; - int64_t sum_tmp = 0; - highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16, - 16, &sse_tmp, &sum_tmp); - sum_long += sum_tmp; - sse_long += sse_tmp; - } - src_ptr += 16 * src_stride; - ref_ptr += 16 * ref_stride; - } - *sum = sum_long; - *sse = sse_long; - } +// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all +// block sizes can be processed in 
32-bit elements (1023*1023*64*16 = 1071645696 +// for a 64x64 block). +static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + int i = h; + do { + int j = 0; + do { + const uint16x8_t s = vld1q_u16(src_ptr + j); + const uint16x8_t r = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = horizontal_long_add_uint32x4(vaddq_u32( + vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1]))); } -static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)sse_long; - *sum = (int)sum_long; +static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum); } -static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); +static INLINE void highbd_variance_16xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum); } -static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride, - const uint8_t *ref8_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint64_t sse_long = 0; - int64_t sum_long = 0; - highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long, - &sum_long); - *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); - *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); +static INLINE void highbd_variance_32xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); } -#define HIGHBD_VAR(W, H) \ - uint32_t vpx_highbd_8_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ - } \ - \ - uint32_t vpx_highbd_10_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, 
ref_stride, W, H, sse, \ - &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } \ - \ - uint32_t vpx_highbd_12_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \ - return (var >= 0) ? (uint32_t)var : 0; \ - } +static INLINE void highbd_variance_64xh_neon(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} -#define HIGHBD_GET_VAR(S) \ - void vpx_highbd_8_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } \ - \ - void vpx_highbd_10_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } \ - \ - void vpx_highbd_12_get##S##x##S##var_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse, int *sum) { \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \ - sum); \ - } +// For 12-bit data, we can only accumulate up to 128 elements in the sum of +// squares (4095*4095*128 = 2146435200), and because we're using two int32x4 +// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128) +// or 16 64-element rows before we have to accumulate into 64-bit elements. +// Therefore blocks of size 32x64, 64x32 and 64x64 are processed in a different +// helper function. -#define HIGHBD_MSE(W, H) \ - uint32_t vpx_highbd_8_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_10_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } \ - \ - uint32_t vpx_highbd_12_mse##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \ - &sum); \ - return *sse; \ - } +// Process a block of any size where the width is divisible by 8, with +// accumulation into 64-bit elements. 
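// For example (following the bound above; the callers themselves are not
// shown in this hunk): a 64-wide 12-bit block is processed with
// h_limit = 16, since 64 * 16 elements spread over the 8 lanes of the two
// int32x4 accumulators is 128 squared values per lane, and
// 4095 * 4095 * 128 = 2146435200 still fits below 2^31.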
+static INLINE void highbd_variance_xlarge_neon( + const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr, + int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); -static INLINE void highbd_var_filter_block2d_bil_first_pass( - const uint8_t *src_ptr8, uint16_t *output_ptr, - unsigned int src_pixels_per_line, int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8); - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } + // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit + // accumulator overflows. After hitting this limit we accumulate into 64-bit + // elements. + int h_tmp = h > h_limit ? 
h_limit : h; + + int i = 0; + do { + int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + do { + int j = 0; + do { + const uint16x8_t s0 = vld1q_u16(src_ptr + j); + const uint16x8_t r0 = vld1q_u16(ref_ptr + j); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0)); + sum_s32 = vpadalq_s16(sum_s32, diff); + + sse_s32[0] = + vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff)); + sse_s32[1] = + vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + i++; + } while (i < h_tmp); + + sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]); + sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]); + h_tmp += h_limit; + } while (i < h); + + *sum = horizontal_add_int32x4(sum_s32); + *sse = (uint64_t)horizontal_add_int64x2(sse_s64); } -static INLINE void highbd_var_filter_block2d_bil_second_pass( - const uint16_t *src_ptr, uint16_t *output_ptr, - unsigned int src_pixels_per_line, unsigned int pixel_step, - unsigned int output_height, unsigned int output_width, - const uint8_t *filter) { - uint32_t i, j; - - uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1); - uint16x4_t filter1_u16 = vdup_n_u16(filter[0]); - uint16x4_t filter2_u16 = vdup_n_u16(filter[1]); - - if (output_width >= 8) { - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 8) { - const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]); - const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16)); - uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16)); - uint16x4_t out1_u16; - uint16x4_t out2_u16; - sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16)); - sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16)); - out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS); - out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS); - vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16)); - } - // Next row... - src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } else { - assert(output_width >= 4); - for (i = 0; i < output_height; ++i) { - for (j = 0; j < output_width; j += 4) { - const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]); - const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]); - uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16); - uint16x4_t out_u16; - sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16); - out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS); - vst1_u16(&output_ptr[j], out_u16); - } - // Next row... 
- src_ptr += src_pixels_per_line; - output_ptr += output_width; - } - } +static INLINE void highbd_variance_32xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse, + sum); +} + +static INLINE void highbd_variance_64xh_xlarge_neon( + const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, int64_t *sum) { + highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse, + sum); } -#define HIGHBD_SUBPIX_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \ +#define HBD_VARIANCE_WXH_8_NEON(w, h) \ + uint32_t vpx_highbd_8_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ } -#define HIGHBD_SUBPIX_AVG_VAR(W, H) \ - uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - 
highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \ - ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_10_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ - } \ - \ - uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \ - const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \ - const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint16_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \ - \ - highbd_var_filter_block2d_bil_first_pass( \ - src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \ - highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \ - bilinear_filters[y_offset]); \ - \ - vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \ - H, temp2, W); \ - \ - return vpx_highbd_12_variance##W##x##H##_neon( \ - CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \ +#define HBD_VARIANCE_WXH_10_NEON(w, h) \ + uint32_t vpx_highbd_10_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ } -void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, - int width, int height, const uint16_t *ref, - int ref_stride) { - int i, j; - uint32x4_t one_u32 = vdupq_n_u32(1); - if (width >= 8) { - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 8) { - const uint16x8_t pred_u16 = vld1q_u16(&pred[j]); - const uint16x8_t ref_u16 = vld1q_u16(&ref[j]); - const uint32x4_t sum1_u32 = - vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16)); - const uint32x4_t sum2_u32 = - vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16)); - const uint16x4_t sum1_u16 = - vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1); - const uint16x4_t sum2_u16 = - vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1); - const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16); - vst1q_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } - } else { - assert(width >= 4); - for (i = 0; i < height; ++i) { - for (j = 0; j < width; j += 4) { - const uint16x4_t pred_u16 = vld1_u16(&pred[j]); - const uint16x4_t ref_u16 = vld1_u16(&ref[j]); - const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16); - const uint16x4_t vcomp_pred = - vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1); - vst1_u16(&comp_pred[j], vcomp_pred); - } - comp_pred += width; - pred += width; - ref += ref_stride; - } +#define HBD_VARIANCE_WXH_12_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ } -} -/* All three forms of the variance are available in the same sizes. */ -#define HIGHBD_VARIANCES(W, H) \ - HIGHBD_VAR(W, H) \ - HIGHBD_SUBPIX_VAR(W, H) \ - HIGHBD_SUBPIX_AVG_VAR(W, H) - -HIGHBD_VARIANCES(64, 64) -HIGHBD_VARIANCES(64, 32) -HIGHBD_VARIANCES(32, 64) -HIGHBD_VARIANCES(32, 32) -HIGHBD_VARIANCES(32, 16) -HIGHBD_VARIANCES(16, 32) -HIGHBD_VARIANCES(16, 16) -HIGHBD_VARIANCES(16, 8) -HIGHBD_VARIANCES(8, 16) -HIGHBD_VARIANCES(8, 8) -HIGHBD_VARIANCES(8, 4) -HIGHBD_VARIANCES(4, 8) -HIGHBD_VARIANCES(4, 4) +#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \ + uint32_t vpx_highbd_12_variance##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_NEON(4, 4) +HBD_VARIANCE_WXH_8_NEON(4, 8) + +HBD_VARIANCE_WXH_8_NEON(8, 4) +HBD_VARIANCE_WXH_8_NEON(8, 8) +HBD_VARIANCE_WXH_8_NEON(8, 16) + +HBD_VARIANCE_WXH_8_NEON(16, 8) +HBD_VARIANCE_WXH_8_NEON(16, 16) +HBD_VARIANCE_WXH_8_NEON(16, 32) + +HBD_VARIANCE_WXH_8_NEON(32, 16) +HBD_VARIANCE_WXH_8_NEON(32, 32) +HBD_VARIANCE_WXH_8_NEON(32, 64) + +HBD_VARIANCE_WXH_8_NEON(64, 32) +HBD_VARIANCE_WXH_8_NEON(64, 64) + +// 10-bit +HBD_VARIANCE_WXH_10_NEON(4, 4) +HBD_VARIANCE_WXH_10_NEON(4, 8) + +HBD_VARIANCE_WXH_10_NEON(8, 4) +HBD_VARIANCE_WXH_10_NEON(8, 8) +HBD_VARIANCE_WXH_10_NEON(8, 16) + +HBD_VARIANCE_WXH_10_NEON(16, 8) +HBD_VARIANCE_WXH_10_NEON(16, 16) +HBD_VARIANCE_WXH_10_NEON(16, 32) + +HBD_VARIANCE_WXH_10_NEON(32, 16) +HBD_VARIANCE_WXH_10_NEON(32, 32) +HBD_VARIANCE_WXH_10_NEON(32, 64) + +HBD_VARIANCE_WXH_10_NEON(64, 32) +HBD_VARIANCE_WXH_10_NEON(64, 64) + +// 12-bit +HBD_VARIANCE_WXH_12_NEON(4, 4) +HBD_VARIANCE_WXH_12_NEON(4, 8) + +HBD_VARIANCE_WXH_12_NEON(8, 4) +HBD_VARIANCE_WXH_12_NEON(8, 8) +HBD_VARIANCE_WXH_12_NEON(8, 16) + +HBD_VARIANCE_WXH_12_NEON(16, 8) +HBD_VARIANCE_WXH_12_NEON(16, 16) +HBD_VARIANCE_WXH_12_NEON(16, 32) + +HBD_VARIANCE_WXH_12_NEON(32, 16) +HBD_VARIANCE_WXH_12_NEON(32, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64) + +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32) +HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64) + +#define HIGHBD_GET_VAR(S) \ + void vpx_highbd_8_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + *sum = (int)sum_long; \ + } \ + \ + void vpx_highbd_10_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + } \ + \ + void vpx_highbd_12_get##S##x##S##var_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse, int *sum) { \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##S##xh_neon(src, src_stride, ref, ref_stride, S, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + } HIGHBD_GET_VAR(8) HIGHBD_GET_VAR(16) -HIGHBD_MSE(16, 16) -HIGHBD_MSE(16, 8) -HIGHBD_MSE(8, 16) -HIGHBD_MSE(8, 8) +static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u32[0] = + vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff)); + sse_u32[1] = + vmlal_u16(sse_u32[1], 
vget_high_u16(diff), vget_high_u16(diff)); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h); +} + +static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h); +} + +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t vpx_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON(16, 16) +HIGHBD_MSE_WXH_NEON(16, 8) +HIGHBD_MSE_WXH_NEON(8, 16) +HIGHBD_MSE_WXH_NEON(8, 8) + +#undef HIGHBD_MSE_WXH_NEON diff --git a/vpx_dsp/arm/highbd_variance_neon_dotprod.c b/vpx_dsp/arm/highbd_variance_neon_dotprod.c new file mode 100644 index 000000000..1a8872017 --- /dev/null +++ b/vpx_dsp/arm/highbd_variance_neon_dotprod.c @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2022 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h / 2; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u16(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u16(ref_ptr); + ref_ptr += ref_stride; + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + uint16x8_t s0, s1, r0, r1; + uint8x16_t s, r, diff; + + s0 = vld1q_u16(src_ptr); + s1 = vld1q_u16(src_ptr + 8); + r0 = vld1q_u16(ref_ptr); + r1 = vld1q_u16(ref_ptr + 8); + + s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1)); + r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1)); + + diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, diff, diff); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse_u32); +} + +#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \ + uint32_t vpx_highbd_8_mse##w##x##h##_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + *sse = \ + highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16) +HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8) + +#undef HIGHBD_MSE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index c46c01631..47684473c 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -355,7 +355,6 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, } else { const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); @@ -365,6 +364,7 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, if (h == 4) { int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int32x4_t d0, d1, d2, d3; + uint16x8_t t0, t1, t2, t3; uint16x8_t d01, d23, t01, t23; __builtin_prefetch(src + 0 * src_stride); diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 9d2752e09..775108208 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -26,76 +26,88 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, (void)bd; if (w < 8) { // copy4 + uint16x4_t s0, s1; do { - vst1_u16(dst, vld1_u16(src)); + s0 = vld1_u16(src); src += src_stride; - dst += dst_stride; - vst1_u16(dst, vld1_u16(src)); + s1 = vld1_u16(src); src += 
src_stride; + + vst1_u16(dst, s0); + dst += dst_stride; + vst1_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w == 8) { // copy8 + uint16x8_t s0, s1; do { - vst1q_u16(dst, vld1q_u16(src)); + s0 = vld1q_u16(src); src += src_stride; - dst += dst_stride; - vst1q_u16(dst, vld1q_u16(src)); + s1 = vld1q_u16(src); src += src_stride; + + vst1q_u16(dst, s0); + dst += dst_stride; + vst1q_u16(dst, s1); dst += dst_stride; h -= 2; - } while (h > 0); + } while (h != 0); } else if (w < 32) { // copy16 + uint16x8_t s0, s1, s2, s3; do { - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); src += src_stride; - dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); + s2 = vld1q_u16(src); + s3 = vld1q_u16(src + 8); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); dst += dst_stride; - vst2q_u16(dst, vld2q_u16(src)); - src += src_stride; + vst1q_u16(dst, s2); + vst1q_u16(dst + 8, s3); dst += dst_stride; - h -= 4; - } while (h > 0); + h -= 2; + } while (h != 0); } else if (w == 32) { // copy32 + uint16x8_t s0, s1, s2, s3; do { - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); src += src_stride; + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); dst += dst_stride; - h -= 4; - } while (h > 0); + } while (--h != 0); } else { // copy64 + uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7; do { - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + s4 = vld1q_u16(src + 32); + s5 = vld1q_u16(src + 40); + s6 = vld1q_u16(src + 48); + s7 = vld1q_u16(src + 56); src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - vst4q_u16(dst, vld4q_u16(src)); - vst4q_u16(dst + 32, vld4q_u16(src + 32)); - src += src_stride; - dst += dst_stride; - h -= 4; - } while (h > 0); + + vst1q_u16(dst, s0); + vst1q_u16(dst + 8, s1); + vst1q_u16(dst + 16, s2); + vst1q_u16(dst + 24, s3); + vst1q_u16(dst + 32, s4); + vst1q_u16(dst + 40, s5); + vst1q_u16(dst + 48, s6); + vst1q_u16(dst + 56, s7); + dst += dst_stride; + } while (--h != 0); } } diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 38e275834..4f909e493 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -12,51 +12,47 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "mem_neon.h" +#include "sum_neon.h" #include "vpx/vpx_integer.h" //------------------------------------------------------------------------------ // DC 4x4 -static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - const uint16x4_t p0 = vpaddl_u8(ref_u8); - return vpadd_u16(p0, p0); +static INLINE uint16_t dc_sum_4(const uint8_t *ref) { + return horizontal_add_uint8x4(load_unaligned_u8_4x1(ref)); } static INLINE void dc_store_4x4(uint8_t *dst, 
ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 4; ++i, dst += stride) { - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t a = vld1_u8(above); - const uint8x8_t l = vld1_u8(left); - const uint16x8_t al = vaddl_u8(a, l); - uint16x4_t sum; - uint8x8_t dc; - sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint8x8_t a = load_unaligned_u8_4x1(above); + const uint8x8_t l = load_unaligned_u8_4x1(left); + const uint16x4_t al = vget_low_u16(vaddl_u8(a, l)); + const uint16_t sum = horizontal_add_uint16x4(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)above; dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_4(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); + const uint16_t sum = dc_sum_4(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 2); (void)left; dc_store_4x4(dst, stride, dc); } @@ -72,19 +68,15 @@ void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 8x8 -static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { - const uint8x8_t ref_u8 = vld1_u8(ref); - uint16x4_t sum = vpaddl_u8(ref_u8); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_8(const uint8_t *ref) { + return horizontal_add_uint8x8(vld1_u8(ref)); } static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, const uint8x8_t dc) { - const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); int i; for (i = 0; i < 8; ++i, dst += stride) { - vst1_u8(dst, dc_dup); + vst1_u8(dst, dc); } } @@ -92,28 +84,24 @@ void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t above_u8 = vld1_u8(above); const uint8x8_t left_u8 = vld1_u8(left); - const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); - const uint16x8_t p0 = vpaddlq_u8(above_and_left); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16x8_t al = vaddl_u8(above_u8, left_u8); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 4); dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(left); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)above; dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t 
*above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_8(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + const uint16_t sum = dc_sum_8(above); + const uint8x8_t dc = vrshrn_n_u16(vdupq_n_u16(sum), 3); (void)left; dc_store_8x8(dst, stride, dc); } @@ -129,20 +117,15 @@ void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 16x16 -static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { - const uint8x16_t ref_u8 = vld1q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(ref_u8); - uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_16(const uint8_t *ref) { + return horizontal_add_uint8x16(vld1q_u8(ref)); } static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + const uint8x16_t dc) { int i; for (i = 0; i < 16; ++i, dst += stride) { - vst1q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); } } @@ -150,22 +133,19 @@ void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t ref0 = vld1q_u8(above); const uint8x16_t ref1 = vld1q_u8(left); - const uint16x8_t p0 = vpaddlq_u8(ref0); - const uint16x8_t p1 = vpaddlq_u8(ref1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16x8_t a = vpaddlq_u8(ref0); + const uint16x8_t l = vpaddlq_u8(ref1); + const uint16x8_t al = vaddq_u16(a, l); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)above; dc_store_16x16(dst, stride, dc); } @@ -173,8 +153,8 @@ void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_16(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + const uint16_t sum = dc_sum_16(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 4), 0); (void)left; dc_store_16x16(dst, stride, dc); } @@ -182,7 +162,7 @@ void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_16x16(dst, stride, dc); @@ -191,51 +171,41 @@ void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, //------------------------------------------------------------------------------ // DC 32x32 -static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { - const uint8x16x2_t r = vld2q_u8(ref); - const uint16x8_t p0 = vpaddlq_u8(r.val[0]); - const uint16x8_t p1 = vpaddlq_u8(r.val[1]); - const uint16x8_t p2 = vaddq_u16(p0, p1); - 
uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - sum = vpadd_u16(sum, sum); - return vpadd_u16(sum, sum); +static INLINE uint16_t dc_sum_32(const uint8_t *ref) { + const uint8x16_t r0 = vld1q_u8(ref + 0); + const uint8x16_t r1 = vld1q_u8(ref + 16); + const uint16x8_t r01 = vaddq_u16(vpaddlq_u8(r0), vpaddlq_u8(r1)); + return horizontal_add_uint16x8(r01); } static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8x8_t dc) { - uint8x16x2_t dc_dup; + const uint8x16_t dc) { int i; - dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - for (i = 0; i < 32; ++i, dst += stride) { - vst2q_u8(dst, dc_dup); + vst1q_u8(dst + 0, dc); + vst1q_u8(dst + 16, dc); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16x2_t a = vld2q_u8(above); - const uint8x16x2_t l = vld2q_u8(left); - const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); - const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); - const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); - const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); - const uint16x8_t pa = vaddq_u16(pa0, pa1); - const uint16x8_t pl = vaddq_u16(pl0, pl1); - const uint16x8_t pal = vaddq_u16(pa, pl); - uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); - uint8x8_t dc; - sum = vpadd_u16(sum, sum); - sum = vpadd_u16(sum, sum); - dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + const uint8x16_t a0 = vld1q_u8(above + 0); + const uint8x16_t a1 = vld1q_u8(above + 16); + const uint8x16_t l0 = vld1q_u8(left + 0); + const uint8x16_t l1 = vld1q_u8(left + 16); + const uint16x8_t a01 = vaddq_u16(vpaddlq_u8(a0), vpaddlq_u8(a1)); + const uint16x8_t l01 = vaddq_u16(vpaddlq_u8(l0), vpaddlq_u8(l1)); + const uint16x8_t al = vaddq_u16(a01, l01); + const uint16_t sum = horizontal_add_uint16x8(al); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 6), 0); dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(left); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(left); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)above; dc_store_32x32(dst, stride, dc); } @@ -243,8 +213,8 @@ void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint16x4_t sum = dc_sum_32(above); - const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + const uint16_t sum = dc_sum_32(above); + const uint8x16_t dc = vdupq_lane_u8(vrshrn_n_u16(vdupq_n_u16(sum), 5), 0); (void)left; dc_store_32x32(dst, stride, dc); } @@ -252,7 +222,7 @@ void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t dc = vdup_n_u8(0x80); + const uint8x16_t dc = vdupq_n_u8(0x80); (void)above; (void)left; dc_store_32x32(dst, stride, dc); @@ -262,123 +232,629 @@ void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t ABCDEFGH = vld1_u8(above); - const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); - const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); - const uint8x8_t BCDEFGH0 = 
vreinterpret_u8_u64(A1); - const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); - const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); - const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r0 = vreinterpret_u32_u8(avg2); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); + uint8x8_t a0, a1, a2, d0; + uint8_t a7; (void)left; - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); -} -static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, - const uint8x8_t above_right, uint8x8_t *row) { - *row = vext_u8(*row, above_right, 1); - vst1_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above); + a7 = above[7]; + + // [ above[1], ..., above[6], x, x ] + a1 = vext_u8(a0, a0, 1); + // [ above[2], ..., above[7], x, x ] + a2 = vext_u8(a0, a0, 2); + + // d0[0] = AVG3(above[0], above[1], above[2]); + // ... + // d0[5] = AVG3(above[5], above[6], above[7]); + // d0[6] = x (don't care) + // d0[7] = x (don't care) + d0 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + // We want: + // stride=0 [ d0[0], d0[1], d0[2], d0[3] ] + // stride=1 [ d0[1], d0[2], d0[3], d0[4] ] + // stride=2 [ d0[2], d0[3], d0[4], d0[5] ] + // stride=3 [ d0[3], d0[4], d0[5], above[7] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, vext_u8(d0, d0, 1)); + store_u8_4x1(dst + 2 * stride, vext_u8(d0, d0, 2)); + store_u8_4x1(dst + 3 * stride, vext_u8(d0, d0, 3)); + + // We stored the don't-care lane d0[6] above, so fix it up to above[7]. + dst[3 * stride + 3] = a7; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t A0 = vld1_u8(above); - const uint8x8_t above_right = vdup_lane_u8(A0, 7); - const uint8x8_t A1 = vext_u8(A0, above_right, 1); - const uint8x8_t A2 = vext_u8(A0, above_right, 2); - const uint8x8_t avg1 = vhadd_u8(A0, A2); - uint8x8_t row = vrhadd_u8(avg1, A1); + uint8x8_t ax0, a0, a1, a7, d0; (void)left; - vst1_u8(dst, row); - dst += stride; - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - d45_store_8(&dst, stride, above_right, &row); - vst1_u8(dst, above_right); -} - -static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, - const uint8x16_t above_right, uint8x16_t *row) { - *row = vextq_u8(*row, above_right, 1); - vst1q_u8(*dst, *row); - *dst += stride; + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a7 = vld1_dup_u8(above + 7); + + // We want to calculate the AVG3 result in lanes 1-7 inclusive so we can + // shift in above[7] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[6] ] + ax0 = vext_u8(a0, a0, 7); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[7] = AVG3(above[6], above[7], above[8]); + d0 = vrhadd_u8(vhadd_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[7]. 
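// (Editor's aside before the stores below, not part of the patch: the
// vrhadd_u8(vhadd_u8(a, c), b) pattern used above computes the scalar AVG3
// exactly, assuming the usual libvpx definition
// AVG3(a, b, c) = (a + 2 * b + c + 2) >> 2. Writing a + c = 2q + r with
// r in {0, 1}, we get a + 2 * b + c + 2 = 2 * (q + b + 1) + r, so both sides
// reduce to (q + b + 1) >> 1 and the truncating inner halving loses nothing.
// A brute-force cross-check, with hypothetical names:)
#include <assert.h>

#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)

static void check_avg3_halving_identity(void) {
  int a, b, c;
  for (a = 0; a < 256; ++a) {
    for (b = 0; b < 256; ++b) {
      for (c = 0; c < 256; ++c) {
        // vhadd_u8 truncates and vrhadd_u8 rounds; their composition is exact.
        const int via_halving = ((((a + c) >> 1) + b) + 1) >> 1;
        assert(via_halving == AVG3(a, b, c));
      }
    }
  }
}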
+ vst1_u8(dst + 0 * stride, vext_u8(d0, a7, 1)); + vst1_u8(dst + 1 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 5)); + vst1_u8(dst + 5 * stride, vext_u8(d0, a7, 6)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 7)); + vst1_u8(dst + 7 * stride, a7); } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); - const uint8x16_t A1 = vextq_u8(A0, above_right, 1); - const uint8x16_t A2 = vextq_u8(A0, above_right, 2); - const uint8x16_t avg1 = vhaddq_u8(A0, A2); - uint8x16_t row = vrhaddq_u8(avg1, A1); + uint8x16_t ax0, a0, a1, a15, d0; (void)left; - vst1q_u8(dst, row); - dst += stride; - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - d45_store_16(&dst, stride, above_right, &row); - vst1q_u8(dst, above_right); + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_dup_u8(above + 15); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in above[15] later, so shift a0 across by one to get the right + // inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0 = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + + // Undo the earlier ext, incrementally shift in duplicates of above[15]. 
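// (Editor's aside, not part of the patch: each store below writes
// vextq_u8(d0, a15, r + 1) to row r, i.e. the top 15 - r lanes of d0 followed
// by r + 1 copies of above[15]. A scalar model of one output row, with
// hypothetical names:)
#include <stdint.h>

static void d45_16x16_row_model(uint8_t *row, const uint8_t *d0,
                                uint8_t above_right, int r) {
  int c;
  for (c = 0; c < 16; ++c) {
    const int lane = r + 1 + c;  // lane index after the ext shift
    row[c] = (lane < 16) ? d0[lane] : above_right;
  }
}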
+ vst1q_u8(dst + 0 * stride, vextq_u8(d0, a15, 1)); + vst1q_u8(dst + 1 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 3 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 5 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 7 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 9)); + vst1q_u8(dst + 9 * stride, vextq_u8(d0, a15, 10)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d0, a15, 12)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 13)); + vst1q_u8(dst + 13 * stride, vextq_u8(d0, a15, 14)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 15)); + vst1q_u8(dst + 15 * stride, a15); } void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0_0 = vld1q_u8(above); - const uint8x16_t A0_1 = vld1q_u8(above + 16); - const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); - const uint8x16_t A1_0 = vld1q_u8(above + 1); - const uint8x16_t A1_1 = vld1q_u8(above + 17); - const uint8x16_t A2_0 = vld1q_u8(above + 2); - const uint8x16_t A2_1 = vld1q_u8(above + 18); - const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); - const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); - uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); - uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); - int i; + uint8x16_t ax0, a0, a1, a15, a16, a17, a31, d0[2]; (void)left; - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a31 = vld1q_dup_u8(above + 31); + + // We want to calculate the AVG3 result in lanes 1-15 inclusive so we can + // shift in the first lanes of d0[1] later, so shift a0 across by one to get + // the right inputs: + // [ x, above[0], ... , above[14] ] + ax0 = vextq_u8(a0, a0, 15); + + // d0[0] = x (don't care) + // d0[1] = AVG3(above[0], above[1], above[2]); + // ... + // d0[15] = AVG3(above[14], above[15], above[16]); + d0[0] = vrhaddq_u8(vhaddq_u8(ax0, a1), a0); + d0[1] = vrhaddq_u8(vhaddq_u8(a15, a17), a16); + + // Undo the earlier ext, incrementally shift in lanes of d0[1] and then + // duplicates of above[31]. 
+ vst1q_u8(dst + 0 * stride + 0, vextq_u8(d0[0], d0[1], 1)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d0[0], d0[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0[0], d0[1], 3)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d0[0], d0[1], 4)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0[0], d0[1], 5)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d0[0], d0[1], 6)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0[0], d0[1], 7)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d0[0], d0[1], 8)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0[0], d0[1], 9)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d0[0], d0[1], 10)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0[0], d0[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d0[0], d0[1], 12)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0[0], d0[1], 13)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d0[0], d0[1], 14)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0[0], d0[1], 15)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 15 * stride + 0, d0[1]); + vst1q_u8(dst + 15 * stride + 16, a31); + + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0[1], a31, 1)); + vst1q_u8(dst + 16 * stride + 16, a31); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d0[1], a31, 2)); + vst1q_u8(dst + 17 * stride + 16, a31); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0[1], a31, 3)); + vst1q_u8(dst + 18 * stride + 16, a31); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d0[1], a31, 4)); + vst1q_u8(dst + 19 * stride + 16, a31); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0[1], a31, 5)); + vst1q_u8(dst + 20 * stride + 16, a31); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d0[1], a31, 6)); + vst1q_u8(dst + 21 * stride + 16, a31); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0[1], a31, 7)); + vst1q_u8(dst + 22 * stride + 16, a31); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d0[1], a31, 8)); + vst1q_u8(dst + 23 * stride + 16, a31); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0[1], a31, 9)); + vst1q_u8(dst + 24 * stride + 16, a31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d0[1], a31, 10)); + vst1q_u8(dst + 25 * stride + 16, a31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0[1], a31, 11)); + vst1q_u8(dst + 26 * stride + 16, a31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d0[1], a31, 12)); + vst1q_u8(dst + 27 * stride + 16, a31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0[1], a31, 13)); + vst1q_u8(dst + 28 * stride + 16, a31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d0[1], a31, 14)); + vst1q_u8(dst + 29 * stride + 16, a31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d0[1], a31, 15)); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, a31); + vst1q_u8(dst + 31 * stride + 16, a31); +} - for (i = 0; i < 30; ++i) 
{ - row_0 = vextq_u8(row_0, row_1, 1); - row_1 = vextq_u8(row_1, above_right, 1); - vst1q_u8(dst, row_0); - dst += 16; - vst1q_u8(dst, row_1); - dst += stride - 16; - } +// ----------------------------------------------------------------------------- + +void vpx_d63_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a3, d0, d1, d2, d3; + (void)left; + + a0 = load_unaligned_u8_4x1(above + 0); + a1 = load_unaligned_u8_4x1(above + 1); + a2 = load_unaligned_u8_4x1(above + 2); + a3 = load_unaligned_u8_4x1(above + 3); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + d2 = vrhadd_u8(a1, a2); + d3 = vrhadd_u8(vhadd_u8(a1, a3), a2); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d63_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t a0, a1, a2, a7, d0, d1; + (void)left; + + a0 = vld1_u8(above + 0); + a1 = vld1_u8(above + 1); + a2 = vld1_u8(above + 2); + a7 = vld1_dup_u8(above + 7); + + d0 = vrhadd_u8(a0, a1); + d1 = vrhadd_u8(vhadd_u8(a0, a2), a1); + + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + + d0 = vext_u8(d0, d0, 7); + d1 = vext_u8(d1, d1, 7); - vst1q_u8(dst, above_right); - dst += 16; - vst1q_u8(dst, row_1); + vst1_u8(dst + 2 * stride, vext_u8(d0, a7, 2)); + vst1_u8(dst + 3 * stride, vext_u8(d1, a7, 2)); + vst1_u8(dst + 4 * stride, vext_u8(d0, a7, 3)); + vst1_u8(dst + 5 * stride, vext_u8(d1, a7, 3)); + vst1_u8(dst + 6 * stride, vext_u8(d0, a7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(d1, a7, 4)); +} + +void vpx_d63_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a15, d0, d1; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a15 = vld1q_dup_u8(above + 15); + + d0 = vrhaddq_u8(a0, a1); + d1 = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + + d0 = vextq_u8(d0, d0, 15); + d1 = vextq_u8(d1, d1, 15); + + vst1q_u8(dst + 2 * stride, vextq_u8(d0, a15, 2)); + vst1q_u8(dst + 3 * stride, vextq_u8(d1, a15, 2)); + vst1q_u8(dst + 4 * stride, vextq_u8(d0, a15, 3)); + vst1q_u8(dst + 5 * stride, vextq_u8(d1, a15, 3)); + vst1q_u8(dst + 6 * stride, vextq_u8(d0, a15, 4)); + vst1q_u8(dst + 7 * stride, vextq_u8(d1, a15, 4)); + vst1q_u8(dst + 8 * stride, vextq_u8(d0, a15, 5)); + vst1q_u8(dst + 9 * stride, vextq_u8(d1, a15, 5)); + vst1q_u8(dst + 10 * stride, vextq_u8(d0, a15, 6)); + vst1q_u8(dst + 11 * stride, vextq_u8(d1, a15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(d0, a15, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d1, a15, 7)); + vst1q_u8(dst + 14 * stride, vextq_u8(d0, a15, 8)); + vst1q_u8(dst + 15 * stride, vextq_u8(d1, a15, 8)); +} + +void vpx_d63_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t a0, a1, a2, a16, a17, a18, a31, d0_lo, d0_hi, d1_lo, d1_hi; + (void)left; + + a0 = vld1q_u8(above + 0); + a1 = vld1q_u8(above + 1); + a2 = vld1q_u8(above + 2); + a16 = vld1q_u8(above + 16); + a17 = vld1q_u8(above + 17); + a18 = vld1q_u8(above + 18); + a31 = vld1q_dup_u8(above + 31); + + d0_lo = vrhaddq_u8(a0, a1); + d0_hi = vrhaddq_u8(a16, a17); + d1_lo = vrhaddq_u8(vhaddq_u8(a0, a2), a1); + d1_hi = vrhaddq_u8(vhaddq_u8(a16, a18), a17); + + vst1q_u8(dst + 0 * stride + 0, d0_lo); 
+ vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + + d0_hi = vextq_u8(d0_lo, d0_hi, 15); + d0_lo = vextq_u8(d0_lo, d0_lo, 15); + d1_hi = vextq_u8(d1_lo, d1_hi, 15); + d1_lo = vextq_u8(d1_lo, d1_lo, 15); + + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_hi, a31, 2)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_hi, a31, 2)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_hi, a31, 3)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_hi, a31, 3)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_hi, a31, 4)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_hi, a31, 4)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_hi, a31, 5)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_hi, a31, 5)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_hi, a31, 6)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_hi, a31, 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_hi, a31, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_hi, a31, 7)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_hi, a31, 8)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_hi, a31, 8)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_hi, a31, 9)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_hi, a31, 9)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_hi, a31, 10)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_hi, a31, 10)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_hi, a31, 11)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_hi, a31, 11)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_hi, a31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_hi, a31, 12)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_hi, a31, 13)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_hi, a31, 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_hi, a31, 14)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_hi, a31, 14)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d0_lo, d0_hi, 15)); + 
vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_hi, a31, 15)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_hi, a31, 15)); + vst1q_u8(dst + 30 * stride + 0, d0_hi); + vst1q_u8(dst + 30 * stride + 16, a31); + vst1q_u8(dst + 31 * stride + 0, d1_hi); + vst1q_u8(dst + 31 * stride + 16, a31); +} + +// ----------------------------------------------------------------------------- + +void vpx_d117_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, d0, d1, d2, d3, col0, col1; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + col0 = vdup_n_u8((above[-1] + 2 * left[0] + left[1] + 2) >> 2); + col1 = vdup_n_u8((left[0] + 2 * left[1] + left[2] + 2) >> 2); + + d0 = vrhadd_u8(az, a0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vext_u8(col0, d0, 7); + d3 = vext_u8(col1, d1, 7); + + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1(dst + 2 * stride, d2); + store_u8_4x1(dst + 3 * stride, d3); +} + +void vpx_d117_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left + 0); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], above[0]) + // d0[1] = AVG2(above[0], above[1]) + // ... + // d0[7] = AVG2(above[6], above[7]) + d0 = vrhadd_u8(az, a0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vector to put the elements to be shifted in + // at the end. The lowest two lanes here are unused: + // col0[7] = AVG3(above[-1], left[0], left[1]) + // col0[6] = AVG3(left[0], left[1], left[2]) + // ... 
+ // col0[2] = AVG3(left[4], left[5], left[6]) + // col0[1] = x (don't care) + // col0[0] = x (don't care) + col0 = vrev64_u8(vrhadd_u8(vhadd_u8(azl0, l1), l0)); + + // We don't care about the first parameter to this uzp since we only ever use + // the high three elements, we just use col0 again since it is already + // available: + // col0_even = [ x, x, x, x, x, col0[3], col0[5], col0[7] ] + // col0_odd = [ x, x, x, x, x, col0[2], col0[4], col0[6] ] + col0_even = vuzp_u8(col0, col0).val[1]; + col0_odd = vuzp_u8(col0, col0).val[0]; + + // Incrementally shift more elements from col0 into d0/1: + // stride=0 [ d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6], d0[7] ] + // stride=1 [ d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7] ] + // stride=2 [ col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5], d0[6] ] + // stride=3 [ col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=4 [ col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4], d0[5] ] + // stride=5 [ col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5] ] + // stride=6 [ col0[3], col0[5], col0[7], d0[0], d0[1], d0[2], d0[3], d0[4] ] + // stride=7 [ col0[2], col0[4], col0[6], d1[0], d1[1], d1[2], d1[3], d1[4] ] + vst1_u8(dst + 0 * stride, d0); + vst1_u8(dst + 1 * stride, d1); + vst1_u8(dst + 2 * stride, vext_u8(col0_even, d0, 7)); + vst1_u8(dst + 3 * stride, vext_u8(col0_odd, d1, 7)); + vst1_u8(dst + 4 * stride, vext_u8(col0_even, d0, 6)); + vst1_u8(dst + 5 * stride, vext_u8(col0_odd, d1, 6)); + vst1_u8(dst + 6 * stride, vext_u8(col0_even, d0, 5)); + vst1_u8(dst + 7 * stride, vext_u8(col0_odd, d1, 5)); +} + +void vpx_d117_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, d0, d1, l0, l1, azl0, col0, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(az, a0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + + col0 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0 = vrev64q_u8(vextq_u8(col0, col0, 8)); + + // The low nine lanes here are unused so the first input to the uzp is + // unused, so just use a duplicate of col0 since we have it already. This + // also means that the lowest lane of col0 here is unused. 
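  // vuzpq_u8 de-interleaves: with the same vector passed twice, .val[0]
  // repeats the even-indexed lanes across both halves and .val[1] the
  // odd-indexed lanes. After the reversal above these are exactly the
  // alternating column values shifted in ahead of d0 and d1 below.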
+ col0_even = vuzpq_u8(col0, col0).val[1]; + col0_odd = vuzpq_u8(col0, col0).val[0]; + + vst1q_u8(dst + 0 * stride, d0); + vst1q_u8(dst + 1 * stride, d1); + vst1q_u8(dst + 2 * stride, vextq_u8(col0_even, d0, 15)); + vst1q_u8(dst + 3 * stride, vextq_u8(col0_odd, d1, 15)); + vst1q_u8(dst + 4 * stride, vextq_u8(col0_even, d0, 14)); + vst1q_u8(dst + 5 * stride, vextq_u8(col0_odd, d1, 14)); + vst1q_u8(dst + 6 * stride, vextq_u8(col0_even, d0, 13)); + vst1q_u8(dst + 7 * stride, vextq_u8(col0_odd, d1, 13)); + vst1q_u8(dst + 8 * stride, vextq_u8(col0_even, d0, 12)); + vst1q_u8(dst + 9 * stride, vextq_u8(col0_odd, d1, 12)); + vst1q_u8(dst + 10 * stride, vextq_u8(col0_even, d0, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(col0_odd, d1, 11)); + vst1q_u8(dst + 12 * stride, vextq_u8(col0_even, d0, 10)); + vst1q_u8(dst + 13 * stride, vextq_u8(col0_odd, d1, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(col0_even, d0, 9)); + vst1q_u8(dst + 15 * stride, vextq_u8(col0_odd, d1, 9)); +} + +void vpx_d117_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d117_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, d0_lo, d0_hi, d1_lo, d1_hi, l0, l1, + l15, l16, l17, azl0, col0_lo, col0_hi, col0_even, col0_odd; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(az, a0); + d0_hi = vrhaddq_u8(a15, a16); + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The last lane of col0_hi is unused here. + col0_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + col0_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + col0_lo = vrev64q_u8(vextq_u8(col0_lo, col0_lo, 8)); + col0_hi = vrev64q_u8(vextq_u8(col0_hi, col0_hi, 8)); + + // The first lane of these are unused since they are only ever called as + // ext(col0, _, i) where i >= 1. 
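  // Operand order matters here: the reversals above leave the near end of
  // the column (small left[] indices) in the high lanes, so col0_lo must
  // supply the high half of each uzp result and col0_hi the low half. The
  // ext shifts below then consume near-column values first and only reach
  // the deeper values on later rows.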
+ col0_even = vuzpq_u8(col0_hi, col0_lo).val[1]; + col0_odd = vuzpq_u8(col0_hi, col0_lo).val[0]; + + vst1q_u8(dst + 0 * stride + 0, d0_lo); + vst1q_u8(dst + 0 * stride + 16, d0_hi); + vst1q_u8(dst + 1 * stride + 0, d1_lo); + vst1q_u8(dst + 1 * stride + 16, d1_hi); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(col0_even, d0_lo, 15)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d0_lo, d0_hi, 15)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(col0_odd, d1_lo, 15)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(col0_even, d0_lo, 14)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d0_lo, d0_hi, 14)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(col0_odd, d1_lo, 14)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 14)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(col0_even, d0_lo, 13)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d0_lo, d0_hi, 13)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(col0_odd, d1_lo, 13)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(col0_even, d0_lo, 12)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d0_lo, d0_hi, 12)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(col0_odd, d1_lo, 12)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d1_lo, d1_hi, 12)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(col0_even, d0_lo, 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d0_lo, d0_hi, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(col0_odd, d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(col0_even, d0_lo, 10)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d0_lo, d0_hi, 10)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(col0_odd, d1_lo, 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d1_lo, d1_hi, 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(col0_even, d0_lo, 9)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d0_lo, d0_hi, 9)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(col0_odd, d1_lo, 9)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(col0_even, d0_lo, 8)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d0_lo, d0_hi, 8)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(col0_odd, d1_lo, 8)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d1_lo, d1_hi, 8)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(col0_even, d0_lo, 7)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d0_lo, d0_hi, 7)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(col0_odd, d1_lo, 7)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(col0_even, d0_lo, 6)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d0_lo, d0_hi, 6)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(col0_odd, d1_lo, 6)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d1_lo, d1_hi, 6)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(col0_even, d0_lo, 5)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d0_lo, d0_hi, 5)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(col0_odd, d1_lo, 5)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(col0_even, d0_lo, 4)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d0_lo, d0_hi, 4)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(col0_odd, d1_lo, 4)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d1_lo, d1_hi, 4)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(col0_even, d0_lo, 3)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d0_lo, d0_hi, 3)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(col0_odd, d1_lo, 3)); + 
vst1q_u8(dst + 27 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(col0_even, d0_lo, 2)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d0_lo, d0_hi, 2)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(col0_odd, d1_lo, 2)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d1_lo, d1_hi, 2)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(col0_even, d0_lo, 1)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d0_lo, d0_hi, 1)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(col0_odd, d1_lo, 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); } // ----------------------------------------------------------------------------- @@ -390,22 +866,14 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8x8_t L3210 = vrev64_u8(L0123); const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); - const uint8x8_t L10XA0123_ = - vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t L10XA0123_ = vext_u8(L210XA0123, L210XA0123, 1); const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)dst, r0, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r1, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r2, 0); - dst += stride; - vst1_lane_u32((uint32_t *)dst, r3, 0); + + store_u8_4x1(dst + 0 * stride, vext_u8(avg2, avg2, 3)); + store_u8_4x1(dst + 1 * stride, vext_u8(avg2, avg2, 2)); + store_u8_4x1(dst + 2 * stride, vext_u8(avg2, avg2, 1)); + store_u8_4x1(dst + 3 * stride, avg2); } void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, @@ -422,31 +890,15 @@ void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); - const uint8x8_t row_0 = vget_low_u8(row); - const uint8x8_t row_1 = vget_high_u8(row); - const uint8x8_t r0 = vext_u8(row_0, row_1, 7); - const uint8x8_t r1 = vext_u8(row_0, row_1, 6); - const uint8x8_t r2 = vext_u8(row_0, row_1, 5); - const uint8x8_t r3 = vext_u8(row_0, row_1, 4); - const uint8x8_t r4 = vext_u8(row_0, row_1, 3); - const uint8x8_t r5 = vext_u8(row_0, row_1, 2); - const uint8x8_t r6 = vext_u8(row_0, row_1, 1); - - vst1_u8(dst, r0); - dst += stride; - vst1_u8(dst, r1); - dst += stride; - vst1_u8(dst, r2); - dst += stride; - vst1_u8(dst, r3); - dst += stride; - vst1_u8(dst, r4); - dst += stride; - vst1_u8(dst, r5); - dst += stride; - vst1_u8(dst, r6); - dst += stride; - vst1_u8(dst, row_0); + + vst1_u8(dst + 0 * stride, vget_low_u8(vextq_u8(row, row, 7))); + vst1_u8(dst + 1 * stride, vget_low_u8(vextq_u8(row, row, 6))); + vst1_u8(dst + 2 * stride, vget_low_u8(vextq_u8(row, row, 5))); + vst1_u8(dst + 3 * stride, vget_low_u8(vextq_u8(row, row, 4))); + vst1_u8(dst + 4 * stride, vget_low_u8(vextq_u8(row, row, 3))); + vst1_u8(dst + 5 * stride, vget_low_u8(vextq_u8(row, row, 2))); + vst1_u8(dst + 6 * stride, vget_low_u8(vextq_u8(row, row, 1))); + vst1_u8(dst + 7 * stride, vget_low_u8(row)); } static INLINE void d135_store_16x8( @@ 
-489,6 +941,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); @@ -496,7 +949,7 @@ void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); - const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_7 = vextq_u8(row_0, row_1, 8); const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); @@ -667,6 +1120,454 @@ void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, d135_store_32x2(&dst, stride, row_0, row_1, row_2); } +// ----------------------------------------------------------------------------- + +void vpx_d153_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02; + + az = load_unaligned_u8_4x1(above - 1); + a0 = load_unaligned_u8_4x1(above + 0); + // [ left[0], above[-1], above[0], above[1], x, x, x, x ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = load_unaligned_u8_4x1(left + 0); + l1 = load_unaligned_u8_4x1(left + 1); + // [ above[-1], left[0], left[1], left[2], x, x, x, x ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + d0 = vrhadd_u8(azl0, l0); + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + d02 = vrev64_u8(vzip_u8(d0, d2).val[0]); + + store_u8_4x1(dst + 0 * stride, vext_u8(d02, d1, 7)); + store_u8_4x1(dst + 1 * stride, vext_u8(d02, d1, 5)); + store_u8_4x1(dst + 2 * stride, vext_u8(d02, d1, 3)); + store_u8_4x1(dst + 3 * stride, vext_u8(d02, d1, 1)); +} + +void vpx_d153_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1_u8(above - 1); + a0 = vld1_u8(above + 0); + // [ left[0], above[-1], ... , above[5] ] + l0az = vext_u8(vld1_dup_u8(left), az, 7); + + l0 = vld1_u8(left); + // The last lane here is unused, reading left[8] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[7], x ] + l1 = vext_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[6] ] + azl0 = vext_u8(vld1_dup_u8(above - 1), l0, 7); + + // d0[0] = AVG2(above[-1], left[0]) + // d0[1] = AVG2(left[0], left[1]) + // ... + // d0[7] = AVG2(left[6], left[7]) + d0 = vrhadd_u8(azl0, l0); + + // d1[0] = AVG3(left[0], above[-1], above[0]) + // d1[1] = AVG3(above[-1], above[0], above[1]) + // ... + // d1[7] = AVG3(above[5], above[6], above[7]) + d1 = vrhadd_u8(vhadd_u8(l0az, a0), az); + + // d2[0] = AVG3(above[-1], left[0], left[1]) + // d2[1] = AVG3(left[0], left[1], left[2]) + // ... 
+ // d2[6] = AVG3(left[5], left[6], left[7]) + // d2[7] = x (don't care) + d2 = vrhadd_u8(vhadd_u8(azl0, l1), l0); + + // The ext instruction shifts elements in from the end of the vector rather + // than the start, so reverse the vectors to put the elements to be shifted + // in at the end. The lowest lane of d02_lo is unused. + d02_lo = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[0]; + d02_hi = vzip_u8(vrev64_u8(d2), vrev64_u8(d0)).val[1]; + + // Incrementally shift more elements from d0/d2 reversed into d1: + // stride=0 [ d0[0], d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6] ] + // stride=1 [ d0[1], d2[0], d0[0], d1[0], d1[1], d1[2], d1[3], d1[4] ] + // stride=2 [ d0[2], d2[1], d0[1], d2[0], d0[0], d1[0], d1[1], d1[2] ] + // stride=3 [ d0[3], d2[2], d0[2], d2[1], d0[1], d2[0], d0[0], d1[0] ] + // stride=4 [ d0[4], d2[3], d0[3], d2[2], d0[2], d2[1], d0[1], d2[0] ] + // stride=5 [ d0[5], d2[4], d0[4], d2[3], d0[3], d2[2], d0[2], d2[1] ] + // stride=6 [ d0[6], d2[5], d0[5], d2[4], d0[4], d2[3], d0[3], d2[2] ] + // stride=7 [ d0[7], d2[6], d0[6], d2[5], d0[5], d2[4], d0[4], d2[3] ] + vst1_u8(dst + 0 * stride, vext_u8(d02_hi, d1, 7)); + vst1_u8(dst + 1 * stride, vext_u8(d02_hi, d1, 5)); + vst1_u8(dst + 2 * stride, vext_u8(d02_hi, d1, 3)); + vst1_u8(dst + 3 * stride, vext_u8(d02_hi, d1, 1)); + vst1_u8(dst + 4 * stride, vext_u8(d02_lo, d02_hi, 7)); + vst1_u8(dst + 5 * stride, vext_u8(d02_lo, d02_hi, 5)); + vst1_u8(dst + 6 * stride, vext_u8(d02_lo, d02_hi, 3)); + vst1_u8(dst + 7 * stride, vext_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, l0az, l0, l1, azl0, d0, d1, d2, d02_lo, d02_hi; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left + 0); + // The last lane here is unused, reading left[16] could cause a buffer + // over-read, so just fill with a duplicate of left[0] to avoid needing to + // materialize a zero: + // [ left[1], ... , left[15], x ] + l1 = vextq_u8(l0, l0, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0 = vrhaddq_u8(azl0, l0); + d1 = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d2 = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + + d0 = vrev64q_u8(vextq_u8(d0, d0, 8)); + d2 = vrev64q_u8(vextq_u8(d2, d2, 8)); + + // The lowest lane of d02_lo is unused. 
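  // vzipq_u8 interleaves lane by lane, so after the reversal above the zip
  // yields [ d2[15], d0[15], d2[14], d0[14], ... ] (d2[15] being the unused
  // lane). The ext index in the stores below then drops by two per row,
  // prepending one more (d0, d2) column pair ahead of d1 each time.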
+ d02_lo = vzipq_u8(d2, d0).val[0]; + d02_hi = vzipq_u8(d2, d0).val[1]; + + vst1q_u8(dst + 0 * stride, vextq_u8(d02_hi, d1, 15)); + vst1q_u8(dst + 1 * stride, vextq_u8(d02_hi, d1, 13)); + vst1q_u8(dst + 2 * stride, vextq_u8(d02_hi, d1, 11)); + vst1q_u8(dst + 3 * stride, vextq_u8(d02_hi, d1, 9)); + vst1q_u8(dst + 4 * stride, vextq_u8(d02_hi, d1, 7)); + vst1q_u8(dst + 5 * stride, vextq_u8(d02_hi, d1, 5)); + vst1q_u8(dst + 6 * stride, vextq_u8(d02_hi, d1, 3)); + vst1q_u8(dst + 7 * stride, vextq_u8(d02_hi, d1, 1)); + vst1q_u8(dst + 8 * stride, vextq_u8(d02_lo, d02_hi, 15)); + vst1q_u8(dst + 9 * stride, vextq_u8(d02_lo, d02_hi, 13)); + vst1q_u8(dst + 10 * stride, vextq_u8(d02_lo, d02_hi, 11)); + vst1q_u8(dst + 11 * stride, vextq_u8(d02_lo, d02_hi, 9)); + vst1q_u8(dst + 12 * stride, vextq_u8(d02_lo, d02_hi, 7)); + vst1q_u8(dst + 13 * stride, vextq_u8(d02_lo, d02_hi, 5)); + vst1q_u8(dst + 14 * stride, vextq_u8(d02_lo, d02_hi, 3)); + vst1q_u8(dst + 15 * stride, vextq_u8(d02_lo, d02_hi, 1)); +} + +void vpx_d153_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + // See vpx_d153_predictor_8x8_neon for more details on the implementation. + uint8x16_t az, a0, a14, a15, a16, l0az, l0, l1, l15, l16, l17, azl0, d0_lo, + d0_hi, d1_lo, d1_hi, d2_lo, d2_hi; + uint8x16x2_t d02_hi, d02_lo; + + az = vld1q_u8(above - 1); + a0 = vld1q_u8(above + 0); + a14 = vld1q_u8(above + 14); + a15 = vld1q_u8(above + 15); + a16 = vld1q_u8(above + 16); + // [ left[0], above[-1], ... , above[13] ] + l0az = vextq_u8(vld1q_dup_u8(left), az, 15); + + l0 = vld1q_u8(left); + l1 = vld1q_u8(left + 1); + l15 = vld1q_u8(left + 15); + l16 = vld1q_u8(left + 16); + // The last lane here is unused, reading left[32] would cause a buffer + // over-read (observed as an address-sanitizer failure), so just fill with a + // duplicate of left[16] to avoid needing to materialize a zero: + // [ left[17], ... , left[31], x ] + l17 = vextq_u8(l16, l16, 1); + // [ above[-1], left[0], ... , left[14] ] + azl0 = vextq_u8(vld1q_dup_u8(above - 1), l0, 15); + + d0_lo = vrhaddq_u8(azl0, l0); + d0_hi = vrhaddq_u8(l15, l16); + + d1_lo = vrhaddq_u8(vhaddq_u8(l0az, a0), az); + d1_hi = vrhaddq_u8(vhaddq_u8(a14, a16), a15); + + // The highest lane of d2_hi is unused. + d2_lo = vrhaddq_u8(vhaddq_u8(azl0, l1), l0); + d2_hi = vrhaddq_u8(vhaddq_u8(l15, l17), l16); + + d0_lo = vrev64q_u8(vextq_u8(d0_lo, d0_lo, 8)); + d0_hi = vrev64q_u8(vextq_u8(d0_hi, d0_hi, 8)); + + d2_lo = vrev64q_u8(vextq_u8(d2_lo, d2_lo, 8)); + d2_hi = vrev64q_u8(vextq_u8(d2_hi, d2_hi, 8)); + + // d02_hi.val[0][0] is unused here. 
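  // Each vzipq_u8 returns both interleaved halves at once, so the two zips
  // below produce all four 16-byte column vectors consumed by the 32 rows
  // of descending stores.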
+ d02_hi = vzipq_u8(d2_hi, d0_hi); + d02_lo = vzipq_u8(d2_lo, d0_lo); + + vst1q_u8(dst + 0 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 0 * stride + 16, vextq_u8(d1_lo, d1_hi, 15)); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(d1_lo, d1_hi, 13)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(d1_lo, d1_hi, 11)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(d1_lo, d1_hi, 9)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(d1_lo, d1_hi, 7)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(d1_lo, d1_hi, 5)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(d1_lo, d1_hi, 3)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(d1_lo, d1_hi, 1)); + vst1q_u8(dst + 8 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 8 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 15)); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 13)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 10 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 11)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 9)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 7)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 5)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 3)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(d02_lo.val[1], d1_lo, 1)); + vst1q_u8(dst + 16 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 16 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 15)); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 13)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 11)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 9)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 7)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 5)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(d02_lo.val[0], d02_lo.val[1], 3)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); + vst1q_u8(dst + 23 * stride + 16, 
vextq_u8(d02_lo.val[0], d02_lo.val[1], 1)); + vst1q_u8(dst + 24 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 15)); + vst1q_u8(dst + 24 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 15)); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 13)); + vst1q_u8(dst + 25 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 13)); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 11)); + vst1q_u8(dst + 26 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 11)); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 9)); + vst1q_u8(dst + 27 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 9)); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 7)); + vst1q_u8(dst + 28 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 7)); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 5)); + vst1q_u8(dst + 29 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 5)); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 3)); + vst1q_u8(dst + 30 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 3)); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(d02_hi.val[0], d02_hi.val[1], 1)); + vst1q_u8(dst + 31 * stride + 16, vextq_u8(d02_hi.val[1], d02_lo.val[0], 1)); +} + +// ----------------------------------------------------------------------------- + +void vpx_d207_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l0, l3, l1, l2, c0, c1, c01, d0, d1; + (void)above; + + // We need the low half lanes here for the c0/c1 arithmetic but the high half + // lanes for the ext: + // [ left[0], left[1], left[2], left[3], left[0], left[1], left[2], left[3] ] + l0 = load_replicate_u8_4x1(left + 0); + l3 = vld1_dup_u8(left + 3); + + // [ left[1], left[2], left[3], left[3], x, x, x, x ] + l1 = vext_u8(l0, l3, 5); + // [ left[2], left[3], left[3], left[3], x, x, x, x ] + l2 = vext_u8(l0, l3, 6); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + // [ c0[0], c1[0], c0[1], c1[1], c0[2], c1[2], c0[3], c1[3] ] + c01 = vzip_u8(c0, c1).val[0]; + + d0 = c01; + d1 = vext_u8(c01, l3, 2); + + // Store the high half of the vector for stride={2,3} to avoid needing + // additional ext instructions: + // stride=0 [ c0[0], c1[0], c0[1], c1[1] ] + // stride=1 [ c0[1], c1[1], c0[2], c1[2] ] + // stride=2 [ c0[2], c1[2], c0[3], c1[3] ] + // stride=3 [ c0[3], c1[3], left[3], left[3] ] + store_u8_4x1(dst + 0 * stride, d0); + store_u8_4x1(dst + 1 * stride, d1); + store_u8_4x1_high(dst + 2 * stride, d0); + store_u8_4x1_high(dst + 3 * stride, d1); +} + +void vpx_d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x8_t l7, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1_u8(left + 0); + l7 = vld1_dup_u8(left + 7); + + // [ left[1], left[2], left[3], left[4], left[5], left[6], left[7], left[7] ] + l1 = vext_u8(l0, l7, 1); + // [ left[2], left[3], left[4], left[5], left[6], left[7], left[7], left[7] ] + l2 = vext_u8(l0, l7, 2); + + c0 = vrhadd_u8(l0, l1); + c1 = vrhadd_u8(vhadd_u8(l0, l2), l1); + + c01_lo = vzip_u8(c0, c1).val[0]; + c01_hi = vzip_u8(c0, c1).val[1]; + + vst1_u8(dst + 0 * stride, c01_lo); + vst1_u8(dst + 1 * stride, vext_u8(c01_lo, c01_hi, 2)); + vst1_u8(dst + 2 * stride, vext_u8(c01_lo, c01_hi, 4)); + vst1_u8(dst + 3 * stride, vext_u8(c01_lo, c01_hi, 6)); + vst1_u8(dst + 4 * stride, c01_hi); + vst1_u8(dst + 5 * stride, vext_u8(c01_hi, l7, 
2)); + vst1_u8(dst + 6 * stride, vext_u8(c01_hi, l7, 4)); + vst1_u8(dst + 7 * stride, vext_u8(c01_hi, l7, 6)); +} + +void vpx_d207_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l15, l0, l1, l2, c0, c1, c01_lo, c01_hi; + (void)above; + + l0 = vld1q_u8(left + 0); + l15 = vld1q_dup_u8(left + 15); + + l1 = vextq_u8(l0, l15, 1); + l2 = vextq_u8(l0, l15, 2); + + c0 = vrhaddq_u8(l0, l1); + c1 = vrhaddq_u8(vhaddq_u8(l0, l2), l1); + + c01_lo = vzipq_u8(c0, c1).val[0]; + c01_hi = vzipq_u8(c0, c1).val[1]; + + vst1q_u8(dst + 0 * stride, c01_lo); + vst1q_u8(dst + 1 * stride, vextq_u8(c01_lo, c01_hi, 2)); + vst1q_u8(dst + 2 * stride, vextq_u8(c01_lo, c01_hi, 4)); + vst1q_u8(dst + 3 * stride, vextq_u8(c01_lo, c01_hi, 6)); + vst1q_u8(dst + 4 * stride, vextq_u8(c01_lo, c01_hi, 8)); + vst1q_u8(dst + 5 * stride, vextq_u8(c01_lo, c01_hi, 10)); + vst1q_u8(dst + 6 * stride, vextq_u8(c01_lo, c01_hi, 12)); + vst1q_u8(dst + 7 * stride, vextq_u8(c01_lo, c01_hi, 14)); + vst1q_u8(dst + 8 * stride, c01_hi); + vst1q_u8(dst + 9 * stride, vextq_u8(c01_hi, l15, 2)); + vst1q_u8(dst + 10 * stride, vextq_u8(c01_hi, l15, 4)); + vst1q_u8(dst + 11 * stride, vextq_u8(c01_hi, l15, 6)); + vst1q_u8(dst + 12 * stride, vextq_u8(c01_hi, l15, 8)); + vst1q_u8(dst + 13 * stride, vextq_u8(c01_hi, l15, 10)); + vst1q_u8(dst + 14 * stride, vextq_u8(c01_hi, l15, 12)); + vst1q_u8(dst + 15 * stride, vextq_u8(c01_hi, l15, 14)); +} + +void vpx_d207_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + uint8x16_t l0_lo, l0_hi, l1_lo, l1_hi, l2_lo, l2_hi, l31, c0_lo, c0_hi, c1_lo, + c1_hi, c01[4]; + (void)above; + + l0_lo = vld1q_u8(left + 0); + l0_hi = vld1q_u8(left + 16); + l31 = vld1q_dup_u8(left + 31); + + l1_lo = vextq_u8(l0_lo, l0_hi, 1); + l1_hi = vextq_u8(l0_hi, l31, 1); + l2_lo = vextq_u8(l0_lo, l0_hi, 2); + l2_hi = vextq_u8(l0_hi, l31, 2); + + c0_lo = vrhaddq_u8(l0_lo, l1_lo); + c0_hi = vrhaddq_u8(l0_hi, l1_hi); + c1_lo = vrhaddq_u8(vhaddq_u8(l0_lo, l2_lo), l1_lo); + c1_hi = vrhaddq_u8(vhaddq_u8(l0_hi, l2_hi), l1_hi); + + c01[0] = vzipq_u8(c0_lo, c1_lo).val[0]; + c01[1] = vzipq_u8(c0_lo, c1_lo).val[1]; + c01[2] = vzipq_u8(c0_hi, c1_hi).val[0]; + c01[3] = vzipq_u8(c0_hi, c1_hi).val[1]; + + vst1q_u8(dst + 0 * stride + 0, c01[0]); + vst1q_u8(dst + 0 * stride + 16, c01[1]); + vst1q_u8(dst + 1 * stride + 0, vextq_u8(c01[0], c01[1], 2)); + vst1q_u8(dst + 1 * stride + 16, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 2 * stride + 0, vextq_u8(c01[0], c01[1], 4)); + vst1q_u8(dst + 2 * stride + 16, vextq_u8(c01[1], c01[2], 4)); + vst1q_u8(dst + 3 * stride + 0, vextq_u8(c01[0], c01[1], 6)); + vst1q_u8(dst + 3 * stride + 16, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 4 * stride + 0, vextq_u8(c01[0], c01[1], 8)); + vst1q_u8(dst + 4 * stride + 16, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 5 * stride + 0, vextq_u8(c01[0], c01[1], 10)); + vst1q_u8(dst + 5 * stride + 16, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 6 * stride + 0, vextq_u8(c01[0], c01[1], 12)); + vst1q_u8(dst + 6 * stride + 16, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 7 * stride + 0, vextq_u8(c01[0], c01[1], 14)); + vst1q_u8(dst + 7 * stride + 16, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 8 * stride + 0, c01[1]); + vst1q_u8(dst + 8 * stride + 16, c01[2]); + vst1q_u8(dst + 9 * stride + 0, vextq_u8(c01[1], c01[2], 2)); + vst1q_u8(dst + 9 * stride + 16, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 10 * stride + 0, vextq_u8(c01[1], c01[2], 4)); + 
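  // (The +16 halves lag the +0 halves by one 16-byte vector: each row
  // applies the same shift one position further along the c01[] sequence,
  // with the replicated left[31] filling in past the end of the column.)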
vst1q_u8(dst + 10 * stride + 16, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 11 * stride + 0, vextq_u8(c01[1], c01[2], 6)); + vst1q_u8(dst + 11 * stride + 16, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 12 * stride + 0, vextq_u8(c01[1], c01[2], 8)); + vst1q_u8(dst + 12 * stride + 16, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 13 * stride + 0, vextq_u8(c01[1], c01[2], 10)); + vst1q_u8(dst + 13 * stride + 16, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 14 * stride + 0, vextq_u8(c01[1], c01[2], 12)); + vst1q_u8(dst + 14 * stride + 16, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 15 * stride + 0, vextq_u8(c01[1], c01[2], 14)); + vst1q_u8(dst + 15 * stride + 16, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 16 * stride + 0, c01[2]); + vst1q_u8(dst + 16 * stride + 16, c01[3]); + vst1q_u8(dst + 17 * stride + 0, vextq_u8(c01[2], c01[3], 2)); + vst1q_u8(dst + 17 * stride + 16, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 18 * stride + 0, vextq_u8(c01[2], c01[3], 4)); + vst1q_u8(dst + 18 * stride + 16, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 19 * stride + 0, vextq_u8(c01[2], c01[3], 6)); + vst1q_u8(dst + 19 * stride + 16, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 20 * stride + 0, vextq_u8(c01[2], c01[3], 8)); + vst1q_u8(dst + 20 * stride + 16, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 21 * stride + 0, vextq_u8(c01[2], c01[3], 10)); + vst1q_u8(dst + 21 * stride + 16, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 22 * stride + 0, vextq_u8(c01[2], c01[3], 12)); + vst1q_u8(dst + 22 * stride + 16, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 23 * stride + 0, vextq_u8(c01[2], c01[3], 14)); + vst1q_u8(dst + 23 * stride + 16, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 24 * stride + 0, c01[3]); + vst1q_u8(dst + 24 * stride + 16, l31); + vst1q_u8(dst + 25 * stride + 0, vextq_u8(c01[3], l31, 2)); + vst1q_u8(dst + 25 * stride + 16, l31); + vst1q_u8(dst + 26 * stride + 0, vextq_u8(c01[3], l31, 4)); + vst1q_u8(dst + 26 * stride + 16, l31); + vst1q_u8(dst + 27 * stride + 0, vextq_u8(c01[3], l31, 6)); + vst1q_u8(dst + 27 * stride + 16, l31); + vst1q_u8(dst + 28 * stride + 0, vextq_u8(c01[3], l31, 8)); + vst1q_u8(dst + 28 * stride + 16, l31); + vst1q_u8(dst + 29 * stride + 0, vextq_u8(c01[3], l31, 10)); + vst1q_u8(dst + 29 * stride + 16, l31); + vst1q_u8(dst + 30 * stride + 0, vextq_u8(c01[3], l31, 12)); + vst1q_u8(dst + 30 * stride + 16, l31); + vst1q_u8(dst + 31 * stride + 0, vextq_u8(c01[3], l31, 14)); + vst1q_u8(dst + 31 * stride + 16, l31); +} + +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 19cfc7c7f..586bfb85a 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -102,6 +102,16 @@ static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { #endif } +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void store_s32q_to_tran_low(tran_low_t *buf, const int32x4_t a) { + vst1q_s32(buf, a); +} + +static INLINE int32x4_t load_tran_low_to_s32q(const tran_low_t *buf) { + return vld1q_s32(buf); +} +#endif + // Propagate type information to the compiler. Without this the compiler may // assume the required alignment of uint32_t (4 bytes) and add alignment hints // to the memory access. @@ -112,6 +122,34 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) { memcpy(buf, &a, 4); } +// Load 4 contiguous bytes when alignment is not guaranteed. 
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) { + uint32_t a; + uint32x2_t a_u32; + memcpy(&a, buf, 4); + a_u32 = vdup_n_u32(0); + a_u32 = vset_lane_u32(a, a_u32, 0); + return vreinterpret_u8_u32(a_u32); +} + +// Load 4 contiguous bytes and replicate across a vector when alignment is not +// guaranteed. +static INLINE uint8x8_t load_replicate_u8_4x1(const uint8_t *buf) { + uint32_t a; + memcpy(&a, buf, 4); + return vreinterpret_u8_u32(vdup_n_u32(a)); +} + +// Store 4 contiguous bytes from the low half of an 8x8 vector. +static INLINE void store_u8_4x1(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 0); +} + +// Store 4 contiguous bytes from the high half of an 8x8 vector. +static INLINE void store_u8_4x1_high(uint8_t *buf, uint8x8_t a) { + vst1_lane_u32((uint32_t *)buf, vreinterpret_u32_u8(a), 1); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, ptrdiff_t stride) { @@ -126,6 +164,29 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, return vreinterpret_u8_u32(a_u32); } +// Load 8 bytes when alignment is not guaranteed. +static INLINE uint16x4_t load_unaligned_u16(const uint16_t *buf) { + uint64_t a; + uint64x1_t a_u64 = vdup_n_u64(0); + memcpy(&a, buf, 8); + a_u64 = vset_lane_u64(a, a_u64, 0); + return vreinterpret_u16_u64(a_u64); +} + +// Load 2 sets of 8 bytes when alignment is not guaranteed. +static INLINE uint16x8_t load_unaligned_u16q(const uint16_t *buf, + ptrdiff_t stride) { + uint64_t a; + uint64x2_t a_u64; + if (stride == 4) return vld1q_u16(buf); + memcpy(&a, buf, 8); + buf += stride; + a_u64 = vdupq_n_u64(a); + memcpy(&a, buf, 8); + a_u64 = vsetq_lane_u64(a, a_u64, 1); + return vreinterpretq_u16_u64(a_u64); +} + // Store 2 sets of 4 bytes when alignment is not guaranteed. 
static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { @@ -202,6 +263,16 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) { vst1_lane_u32((uint32_t *)buf, a_u32, 1); } +static INLINE void store_u8_8x3(uint8_t *s, const ptrdiff_t p, + const uint8x8_t s0, const uint8x8_t s1, + const uint8x8_t s2) { + vst1_u8(s, s0); + s += p; + vst1_u8(s, s1); + s += p; + vst1_u8(s, s2); +} + static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p, uint8x8_t *const s0, uint8x8_t *const s1, uint8x8_t *const s2, uint8x8_t *const s3) { @@ -226,6 +297,16 @@ static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p, vst1_u8(s, s3); } +static INLINE void load_u8_16x3(const uint8_t *s, const ptrdiff_t p, + uint8x16_t *const s0, uint8x16_t *const s1, + uint8x16_t *const s2) { + *s0 = vld1q_u8(s); + s += p; + *s1 = vld1q_u8(s); + s += p; + *s2 = vld1q_u8(s); +} + static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3) { @@ -358,4 +439,25 @@ static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p, vst1q_u8(s, s7); } +static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p, + uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2, + uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5, + uint16x8_t *s6, uint16x8_t *s7) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); +} + #endif // VPX_VPX_DSP_ARM_MEM_NEON_H_ diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c index 9c227d560..e2351fa2c 100644 --- a/vpx_dsp/arm/quantize_neon.c +++ b/vpx_dsp/arm/quantize_neon.c @@ -14,6 +14,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/mem_neon.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff, const int16x8_t dequant, @@ -69,20 +71,19 @@ quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, } void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; + int16_t const *iscan = scan_order->iscan; // Only the first element of each vector is DC. - int16x8_t zbin = vld1q_s16(zbin_ptr); - int16x8_t round = vld1q_s16(round_ptr); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vld1q_s16(mb_plane->zbin); + int16x8_t round = vld1q_s16(mb_plane->round); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. 
@@ -132,7 +133,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } while (n_coeffs > 0); } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -142,10 +143,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)scan; +#endif // VPX_ARCH_AARCH64 } static INLINE int32x4_t extract_sign_bit(int32x4_t a) { @@ -213,23 +211,21 @@ quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr, // Main difference is that zbin values are halved before comparison and dqcoeff // values are divided by 2. zbin is rounded but dqcoeff is not. -void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int16x8_t neg_one = vdupq_n_s16(-1); uint16x8_t eob_max; int i; + const int16_t *iscan = scan_order->iscan; // Only the first element of each vector is DC. - int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1); - int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1); - int16x8_t quant = vld1q_s16(quant_ptr); - int16x8_t quant_shift = vld1q_s16(quant_shift_ptr); + int16x8_t zbin = vrshrq_n_s16(vld1q_s16(mb_plane->zbin), 1); + int16x8_t round = vrshrq_n_s16(vld1q_s16(mb_plane->round), 1); + int16x8_t quant = vld1q_s16(mb_plane->quant); + int16x8_t quant_shift = vld1q_s16(mb_plane->quant_shift); int16x8_t dequant = vld1q_s16(dequant_ptr); // Process first 8 values which include a dc component. 
@@ -276,7 +272,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } } -#ifdef __aarch64__ +#if VPX_ARCH_AARCH64 *eob_ptr = vmaxvq_u16(eob_max); #else { @@ -286,9 +282,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1); vst1_lane_u16(eob_ptr, eob_max_2, 0); } -#endif // __aarch64__ - // Need these here, else the compiler complains about mixing declarations and - // code in C90 - (void)n_coeffs; - (void)scan; +#endif // VPX_ARCH_AARCH64 } diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5fc621aee..713eec7a9 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -17,633 +17,212 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, - const void *const buf1) { - uint32_t a; - uint32x2_t aa; - memcpy(&a, buf0, 4); - aa = vdup_n_u32(a); - memcpy(&a, buf1, 4); - aa = vset_lane_u32(a, aa, 1); - return vreinterpret_u8_u32(aa); -} - -static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, - const uint8_t *const ref_array[4], - const int ref_stride, const int height, - uint32_t sad_array[4]) { - int i; - uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; -#if !defined(__aarch64__) - uint16x4_t a[2]; -#endif - uint32x4_t r; - - assert(!((intptr_t)src_ptr % sizeof(uint32_t))); - assert(!(src_stride % sizeof(uint32_t))); - - for (i = 0; i < height; ++i) { - const uint8x8_t s = vreinterpret_u8_u32( - vld1_dup_u32((const uint32_t *)(src_ptr + i * src_stride))); - const uint8x8_t ref01 = load_unaligned_2_buffers( - ref_array[0] + i * ref_stride, ref_array[1] + i * ref_stride); - const uint8x8_t ref23 = load_unaligned_2_buffers( - ref_array[2] + i * ref_stride, ref_array[3] + i * ref_stride); - abs[0] = vabal_u8(abs[0], s, ref01); - abs[1] = vabal_u8(abs[1], s, ref23); - } - -#if defined(__aarch64__) - abs[0] = vpaddq_u16(abs[0], abs[1]); - r = vpaddlq_u16(abs[0]); -#else - a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); - a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); - r = vpaddlq_u16(vcombine_u16(a[0], a[1])); -#endif - vst1q_u32(sad_array, r); -} - -void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array); -} - -void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array); -} - -//////////////////////////////////////////////////////////////////////////////// - -// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) -static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint16x8_t b0 = vpaddq_u16(a0, a1); - const uint32x4_t r = vpaddlq_u16(b0); -#else - const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint16x4_t b0 = vpadd_u16(a0, a1); - const 
uint16x4_t b1 = vpadd_u16(a2, a3); - const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); -#endif - vst1q_u32(sad_array, r); -} - -#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) - -// Can handle 1024 pixels' sad sum (such as 32x32) -static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); - const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); - const uint32x4_t b0 = vpaddlq_u16(a0); - const uint32x4_t b1 = vpaddlq_u16(a1); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); - const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); - const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); - const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); - const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); - const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); - const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif -} - -// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) -static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t b0 = vpaddq_u32(a0, a1); - const uint32x4_t b1 = vpaddq_u32(a2, a3); - const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); - const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); - const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); - const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); - const uint32x2_t c0 = vpadd_u32(b0, b1); - const uint32x2_t c1 = vpadd_u32(b2, b3); - vst1q_u32(sad_array, vcombine_u32(c0, c1)); -#endif -} - -// Can handle 4096 pixels' sad sum (such as 64x64) -static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8], - uint32_t sad_array[4]) { -#if defined(__aarch64__) - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x4_t c0 = vpaddq_u32(b0, b1); - const uint32x4_t c1 = vpaddq_u32(b2, b3); - const uint32x4_t r = vpaddq_u32(c0, c1); - vst1q_u32(sad_array, r); -#else - const uint32x4_t a0 = vpaddlq_u16(sum[0]); - const uint32x4_t a1 = vpaddlq_u16(sum[1]); - const uint32x4_t a2 = vpaddlq_u16(sum[2]); - const uint32x4_t a3 = vpaddlq_u16(sum[3]); - const uint32x4_t a4 = vpaddlq_u16(sum[4]); - const uint32x4_t a5 = vpaddlq_u16(sum[5]); - 
const uint32x4_t a6 = vpaddlq_u16(sum[6]); - const uint32x4_t a7 = vpaddlq_u16(sum[7]); - const uint32x4_t b0 = vaddq_u32(a0, a1); - const uint32x4_t b1 = vaddq_u32(a2, a3); - const uint32x4_t b2 = vaddq_u32(a4, a5); - const uint32x4_t b3 = vaddq_u32(a6, a7); - const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); - const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); - const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); - const uint32x2_t d0 = vpadd_u32(c0, c1); - const uint32x2_t d1 = vpadd_u32(c2, c3); - vst1q_u32(sad_array, vcombine_u32(d0, d1)); -#endif -} - -#endif - -static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i, j; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint16x8_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vpadalq_u8(*sad_sum, abs_diff); +} + +static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); +} + +static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + 
sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + i++; + } while (i < h); + + vst1q_u32(res, horizontal_long_add_4d_uint16x8(sum_lo, sum_hi)); +} + +static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x8_t s = vld1_u8(src_ptr); - src_ptr += src_stride; - for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); - ref_loop[j] += ref_stride; - sum[j] = vabal_u8(sum[j], s, b_u8); - } - } + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); - sad_512_pel_final_neon(sum, sad_array); -} + i++; + } while (i < h); -void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} - -void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint32x4_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - const uint8x16_t diff = vabdq_u8(src_ptr, r); - *sum = vdotq_u32(*sum, diff, vdupq_n_u8(1)); -} - -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr + i * src_stride); - sad16_neon(ref_loop[0] + i * ref_stride, s, &sum[0]); - sad16_neon(ref_loop[1] + i * ref_stride, s, &sum[1]); - sad16_neon(ref_loop[2] + i * ref_stride, s, &sum[2]); - sad16_neon(ref_loop[3] + i * ref_stride, s, &sum[3]); - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, - uint16x8_t *const sum) { - const uint8x16_t r = vld1q_u8(ref_ptr); - *sum = vabal_u8(*sum, 
vget_low_u8(src_ptr), vget_low_u8(r)); - *sum = vabal_u8(*sum, vget_high_u8(src_ptr), vget_high_u8(r)); +static INLINE void sad8_neon(uint8x8_t src, uint8x8_t ref, + uint16x8_t *const sad_sum) { + uint8x8_t abs_diff = vabd_u8(src, ref); + *sad_sum = vaddw_u8(*sad_sum, abs_diff); } -static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; +static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t s = vld1q_u8(src_ptr); - src_ptr += src_stride; - /* Manual unrolling here stops the compiler from getting confused. */ - sad16_neon(ref_loop[0], s, &sum[0]); - ref_loop[0] += ref_stride; - sad16_neon(ref_loop[1], s, &sum[1]); - ref_loop[1] += ref_stride; - sad16_neon(ref_loop[2], s, &sum[2]); - ref_loop[2] += ref_stride; - sad16_neon(ref_loop[3], s, &sum[3]); - ref_loop[3] += ref_stride; - } - - sad_512_pel_final_neon(sum, sad_array); -} + int i = 0; + do { + const uint8x8_t s = vld1_u8(src + i * src_stride); + sad8_neon(s, vld1_u8(ref[0] + i * ref_stride), &sum[0]); + sad8_neon(s, vld1_u8(ref[1] + i * ref_stride), &sum[1]); + sad8_neon(s, vld1_u8(ref[2] + i * ref_stride), &sum[2]); + sad8_neon(s, vld1_u8(ref[3] + i * ref_stride), &sum[3]); -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); -} + i++; + } while (i < h); -void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4], const int height) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - 
ref_loop[3] += ref_stride; - } - - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); -} - -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); -} - -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); -} +static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t res[4], int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - const int height, uint16x8_t *const sum) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - - sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } -} + int i = 0; + do { + uint8x8_t s = load_unaligned_u8(src + i * src_stride, src_stride); + uint8x8_t r0 = load_unaligned_u8(ref[0] + i * ref_stride, ref_stride); + uint8x8_t r1 = load_unaligned_u8(ref[1] + i * ref_stride, ref_stride); + uint8x8_t r2 = load_unaligned_u8(ref[2] + i * ref_stride, ref_stride); + uint8x8_t r3 = load_unaligned_u8(ref[3] + i * ref_stride, ref_stride); -void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); - sad_512_pel_final_neon(sum, sad_array); -} + sad8_neon(s, r0, &sum[0]); + sad8_neon(s, r1, &sum[1]); + sad8_neon(s, r2, &sum[2]); + sad8_neon(s, r3, &sum[3]); -void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); - sad_1024_pel_final_neon(sum, sad_array); -} + i += 2; + } while (i < h); -void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - uint16x8_t sum[4]; - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); - sad_2048_pel_final_neon(sum, sad_array); + vst1q_u32(res, horizontal_add_4d_uint16x8(sum)); } -#endif // 
defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -//////////////////////////////////////////////////////////////////////////////// - -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) - -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0) }; - - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; +#define SAD_WXH_4D_NEON(w, h) \ + void vpx_sad##w##x##h##x4d_neon(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, src_stride, ref_array, ref_stride, sad_array, \ + (h)); \ } - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - uint32x4_t r0, r1, r2, r3; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), - vdupq_n_u32(0), vdupq_n_u32(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - 
ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; +SAD_WXH_4D_NEON(4, 4) +SAD_WXH_4D_NEON(4, 8) + +SAD_WXH_4D_NEON(8, 4) +SAD_WXH_4D_NEON(8, 8) +SAD_WXH_4D_NEON(8, 16) + +SAD_WXH_4D_NEON(16, 8) +SAD_WXH_4D_NEON(16, 16) +SAD_WXH_4D_NEON(16, 32) + +SAD_WXH_4D_NEON(32, 16) +SAD_WXH_4D_NEON(32, 32) +SAD_WXH_4D_NEON(32, 64) + +SAD_WXH_4D_NEON(64, 32) +SAD_WXH_4D_NEON(64, 64) + +#undef SAD_WXH_4D_NEON + +#define SAD_SKIP_WXH_4D_NEON(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon(src_ptr, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ } - r0 = vpaddq_u32(sum[0], sum[1]); - r1 = vpaddq_u32(sum[2], sum[3]); - r2 = vpaddq_u32(sum[4], sum[5]); - r3 = vpaddq_u32(sum[6], sum[7]); - r0 = vpaddq_u32(r0, r1); - r1 = vpaddq_u32(r2, r3); - vst1q_u32(sad_array, vpaddq_u32(r0, r1)); -} - -#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)) - -void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; +SAD_SKIP_WXH_4D_NEON(4, 4) +SAD_SKIP_WXH_4D_NEON(4, 8) - for (i = 0; i < 32; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +SAD_SKIP_WXH_4D_NEON(8, 4) +SAD_SKIP_WXH_4D_NEON(8, 8) +SAD_SKIP_WXH_4D_NEON(8, 16) - sad_2048_pel_final_neon(sum, sad_array); -} +SAD_SKIP_WXH_4D_NEON(16, 8) +SAD_SKIP_WXH_4D_NEON(16, 16) +SAD_SKIP_WXH_4D_NEON(16, 32) -void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { - int i; - const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], - ref_array[3] }; - uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0), vdupq_n_u16(0) }; - - for (i = 0; i < 64; ++i) { - uint8x16_t s; - - s = vld1q_u8(src_ptr + 0 * 16); - sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); - - s = 
vld1q_u8(src_ptr + 1 * 16); - sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); - sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); - sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); - sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); - - s = vld1q_u8(src_ptr + 2 * 16); - sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); - - s = vld1q_u8(src_ptr + 3 * 16); - sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); - sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); - sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); - sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); - - src_ptr += src_stride; - ref_loop[0] += ref_stride; - ref_loop[1] += ref_stride; - ref_loop[2] += ref_stride; - ref_loop[3] += ref_stride; - } +SAD_SKIP_WXH_4D_NEON(32, 16) +SAD_SKIP_WXH_4D_NEON(32, 32) +SAD_SKIP_WXH_4D_NEON(32, 64) - sad_4096_pel_final_neon(sum, sad_array); -} +SAD_SKIP_WXH_4D_NEON(64, 32) +SAD_SKIP_WXH_4D_NEON(64, 64) -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#undef SAD_SKIP_WXH_4D_NEON diff --git a/vpx_dsp/arm/sad4d_neon_dotprod.c b/vpx_dsp/arm/sad4d_neon_dotprod.c new file mode 100644 index 000000000..933fc48b8 --- /dev/null +++ b/vpx_dsp/arm/sad4d_neon_dotprod.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref, + uint32x4_t *const sad_sum) { + uint8x16_t abs_diff = vabdq_u8(src, ref); + *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1)); +} + +static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1, s2, s3; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + s2 = vld1q_u8(src + i * src_stride + 32); + sad16_neon(s2, vld1q_u8(ref[0] + i * ref_stride + 32), &sum_lo[0]); + sad16_neon(s2, vld1q_u8(ref[1] + i * ref_stride + 32), &sum_lo[1]); + sad16_neon(s2, vld1q_u8(ref[2] + i * ref_stride + 32), &sum_lo[2]); + sad16_neon(s2, vld1q_u8(ref[3] + i * ref_stride + 32), &sum_lo[3]); + + s3 = vld1q_u8(src + i * src_stride + 48); + 
sad16_neon(s3, vld1q_u8(ref[0] + i * ref_stride + 48), &sum_hi[0]); + sad16_neon(s3, vld1q_u8(ref[1] + i * ref_stride + 48), &sum_hi[1]); + sad16_neon(s3, vld1q_u8(ref[2] + i * ref_stride + 48), &sum_hi[2]); + sad16_neon(s3, vld1q_u8(ref[3] + i * ref_stride + 48), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint32x4_t sum[4]; + + int i = 0; + do { + uint8x16_t s0, s1; + + s0 = vld1q_u8(src + i * src_stride); + sad16_neon(s0, vld1q_u8(ref[0] + i * ref_stride), &sum_lo[0]); + sad16_neon(s0, vld1q_u8(ref[1] + i * ref_stride), &sum_lo[1]); + sad16_neon(s0, vld1q_u8(ref[2] + i * ref_stride), &sum_lo[2]); + sad16_neon(s0, vld1q_u8(ref[3] + i * ref_stride), &sum_lo[3]); + + s1 = vld1q_u8(src + i * src_stride + 16); + sad16_neon(s1, vld1q_u8(ref[0] + i * ref_stride + 16), &sum_hi[0]); + sad16_neon(s1, vld1q_u8(ref[1] + i * ref_stride + 16), &sum_hi[1]); + sad16_neon(s1, vld1q_u8(ref[2] + i * ref_stride + 16), &sum_hi[2]); + sad16_neon(s1, vld1q_u8(ref[3] + i * ref_stride + 16), &sum_hi[3]); + + } while (++i < h); + + sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]); + sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]); + sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]); + sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], + int ref_stride, uint32_t res[4], + int h) { + uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + + int i = 0; + do { + const uint8x16_t s = vld1q_u8(src + i * src_stride); + sad16_neon(s, vld1q_u8(ref[0] + i * ref_stride), &sum[0]); + sad16_neon(s, vld1q_u8(ref[1] + i * ref_stride), &sum[1]); + sad16_neon(s, vld1q_u8(ref[2] + i * ref_stride), &sum[2]); + sad16_neon(s, vld1q_u8(ref[3] + i * ref_stride), &sum[3]); + + } while (++i < h); + + vst1q_u32(res, horizontal_add_4d_uint32x4(sum)); +} + +#define SAD_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, src_stride, ref_array, ref_stride, \ + sad_array, (h)); \ + } + +SAD_WXH_4D_NEON_DOTPROD(16, 8) +SAD_WXH_4D_NEON_DOTPROD(16, 16) +SAD_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_WXH_4D_NEON_DOTPROD(32, 16) +SAD_WXH_4D_NEON_DOTPROD(32, 32) +SAD_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_WXH_4D_NEON_DOTPROD(64, 32) +SAD_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_4D_NEON_DOTPROD + +#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \ + void vpx_sad_skip_##w##x##h##x4d_neon_dotprod( \ + const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], int ref_stride, \ + uint32_t sad_array[4]) { \ + sad##w##xhx4d_neon_dotprod(src_ptr, 2 * src_stride, ref_array, \ + 2 * ref_stride, sad_array, ((h) >> 1)); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + 
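The SAD_SKIP_* wrappers above and below implement the "downsampled SAD" used to cheapen motion search: only every other row is compared, by doubling both strides over half the height, and the partial total is doubled (the <<= 1 on each lane) to approximate the full-block SAD with roughly half the memory traffic. A minimal scalar model of that contract follows; the helper names are illustrative only and not part of the patch:

    #include <stdint.h>
    #include <stdlib.h>

    /* Reference scalar SAD over a w x h block. */
    static uint32_t sad_scalar(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride, int w,
                               int h) {
      uint32_t sad = 0;
      for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++) {
          sad += (uint32_t)abs(src[i * src_stride + j] -
                               ref[i * ref_stride + j]);
        }
      }
      return sad;
    }

    /* Model of the skip variant: even rows only, result doubled. */
    static uint32_t sad_skip_scalar(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride, int w,
                                    int h) {
      return 2 * sad_scalar(src, 2 * src_stride, ref, 2 * ref_stride, w,
                            h / 2);
    }

The doubled result is an estimate (odd rows are never read), which is acceptable when ranking motion-search candidates against one another rather than against an exact threshold.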
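For context on the dot-product idiom that distinguishes these *_neon_dotprod paths: vabdq_u8 yields sixteen byte-wide absolute differences, and vdotq_u32 against an all-ones vector folds each group of four adjacent bytes into a 32-bit lane, so one 16-pixel SAD row costs two instructions with no overflow concern. The plain Neon versions must instead widen through vabal_u8/vpadalq_u8 into uint16x8_t accumulators and reduce with helpers such as horizontal_long_add_4d_uint16x8. Below is a hypothetical stand-alone check of the idiom against a scalar sum, assuming an AArch64 toolchain built with -march=armv8.2-a+dotprod; it is not part of the patch:

    #include <arm_neon.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* 16-byte SAD via the ABD + UDOT sequence used above. */
    static uint32_t sad16_dotprod(const uint8_t *src, const uint8_t *ref) {
      uint8x16_t abs_diff = vabdq_u8(vld1q_u8(src), vld1q_u8(ref));
      /* Dot product with all-ones sums each 4-byte group into a u32 lane. */
      uint32x4_t sum = vdotq_u32(vdupq_n_u32(0), abs_diff, vdupq_n_u8(1));
      return vaddvq_u32(sum); /* AArch64 horizontal add. */
    }

    int main(void) {
      uint8_t src[16], ref[16];
      uint32_t scalar = 0;
      for (int i = 0; i < 16; i++) {
        src[i] = (uint8_t)rand();
        ref[i] = (uint8_t)rand();
        scalar += (uint32_t)abs(src[i] - ref[i]);
      }
      printf("scalar=%u neon=%u\n", scalar, sad16_dotprod(src, ref));
      return 0;
    }

The same two-instruction ABD, UDOT pattern is also why the sad_neon_dotprod.c code further down notes that two uint32x4_t accumulators are enough for optimal throughput on CPUs with either two or four Neon pipes.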
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_4D_NEON_DOTPROD diff --git a/vpx_dsp/arm/sad_neon.c b/vpx_dsp/arm/sad_neon.c index ad575d4aa..4dd87ddc0 100644 --- a/vpx_dsp/arm/sad_neon.c +++ b/vpx_dsp/arm/sad_neon.c @@ -17,635 +17,375 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" -uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); -#if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); - const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(dp); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - return horizontal_add_uint16x8(abs); -#endif -} +static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3; + uint8x16_t diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + diff2 = vabdq_u8(s2, r2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + diff3 = vabdq_u8(s3, r3); + sum[3] = vpadalq_u8(sum[3], diff3); -uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); -#if defined(__ARM_FEATURE_DOTPROD) - const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg); - const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1)); - return horizontal_add_uint32x4(prod); -#else - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - return horizontal_add_uint16x8(abs); -#endif -} + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); -uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, 
ref1_u8); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - } + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); - return horizontal_add_uint16x8(abs); -#endif + return horizontal_add_uint32x4(sum_u32); } -uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred) { -#if defined(__ARM_FEATURE_DOTPROD) - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t src2_u8 = - load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride); - const uint8x16_t ref2_u8 = - load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride); - const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred); - const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16); - const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8); - const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8); - const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1); - const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2); - prod = vdotq_u32(prod, sad1_u8, ones); - prod = vdotq_u32(prod, sad2_u8, ones); - return horizontal_add_uint32x4(prod); -#else - int i; - uint16x8_t abs = vdupq_n_u16(0); - for (i = 0; i < 8; i += 4) { - const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride); - const uint8x16_t second_pred_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8); - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(src_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg)); - } +static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint32x4_t sum = vdupq_n_u32(0); - return horizontal_add_uint16x8(abs); -#endif -} + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t diff0 = vabdq_u8(s0, r0); + uint16x8_t sum0 = vpaddlq_u8(diff0); -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; -} + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t diff1 = vabdq_u8(s1, r1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum 
= vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); -static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x2_t prod = vdup_n_u32(0); - const uint8x8_t ones = vdup_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); - const uint8x8_t sad_u8 = vabd_u8(a_u8, avg); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - prod = vdot_u32(prod, sad_u8, ones); - } - return prod; + } while (--i != 0); + + return horizontal_add_uint32x4(sum); } -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x2_t prod = \ - sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x2(prod); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x2_t prod = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x2(prod); \ - } +static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - src_ptr += src_stride; - ref_ptr += ref_stride; - abs = vabal_u8(abs, a_u8, b_u8); - } - return abs; -} + uint8x16_t diff = vabdq_u8(s, r); + sum = vpadalq_u8(sum, diff); -static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(src_ptr); - const uint8x8_t b_u8 = vld1_u8(ref_ptr); - const uint8x8_t c_u8 = vld1_u8(second_pred); - const uint8x8_t avg = vrhadd_u8(b_u8, c_u8); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 8; - abs = vabal_u8(abs, a_u8, avg); - } - return abs; -} + } while (--i != 0); -#define SAD8XN(n) \ - uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD8XN(4) -SAD8XN(8) -SAD8XN(16) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - 
const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t src_u8 = vld1q_u8(src_ptr); - const uint8x16_t ref_u8 = vld1q_u8(ref_ptr); - const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); - const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 16; - prod = vdotq_u32(prod, sad_u8, ones); - } - return prod; -} +static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ - } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - src_ptr += src_stride; - ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(b_u8)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(b_u8)); - } - return abs; -} + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + + sum = vabal_u8(sum, s, r); -static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(src_ptr); - const uint8x16_t b_u8 = vld1q_u8(ref_ptr); - const uint8x16_t c_u8 = vld1q_u8(second_pred); - const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8); src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 16; - abs = vabal_u8(abs, vget_low_u8(a_u8), vget_low_u8(avg)); - abs = vabal_u8(abs, vget_high_u8(a_u8), vget_high_u8(avg)); - } - return abs; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#define SAD16XN(n) \ - uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - 
const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD16XN(8) -SAD16XN(16) -SAD16XN(32) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); - } - return prod; +static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + + sum = vabal_u8(sum, s, r); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); - const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo); - const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 32; - prod = vdotq_u32(prod, sad_lo_u8, ones); - prod = vdotq_u32(prod, sad_hi_u8, ones); +#define SAD_WXH_NEON(w, h) \ + unsigned int vpx_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride) { \ + return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \ } - return prod; -} -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t prod = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(prod); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t prod = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(prod); \ +SAD_WXH_NEON(4, 4) +SAD_WXH_NEON(4, 8) + +SAD_WXH_NEON(8, 4) +SAD_WXH_NEON(8, 8) +SAD_WXH_NEON(8, 16) + +SAD_WXH_NEON(16, 8) +SAD_WXH_NEON(16, 16) +SAD_WXH_NEON(16, 32) + +SAD_WXH_NEON(32, 16) +SAD_WXH_NEON(32, 32) +SAD_WXH_NEON(32, 64) + +SAD_WXH_NEON(64, 32) +SAD_WXH_NEON(64, 64) + +#undef SAD_WXH_NEON + +#define 
SAD_SKIP_WXH_NEON(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \ } -#else // defined(__ARM_FEATURE_DOTPROD) -static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); +SAD_SKIP_WXH_NEON(4, 4) +SAD_SKIP_WXH_NEON(4, 8) + +SAD_SKIP_WXH_NEON(8, 4) +SAD_SKIP_WXH_NEON(8, 8) +SAD_SKIP_WXH_NEON(8, 16) + +SAD_SKIP_WXH_NEON(16, 8) +SAD_SKIP_WXH_NEON(16, 16) +SAD_SKIP_WXH_NEON(16, 32) + +SAD_SKIP_WXH_NEON(32, 16) +SAD_SKIP_WXH_NEON(32, 32) +SAD_SKIP_WXH_NEON(32, 64) + +SAD_SKIP_WXH_NEON(64, 32) +SAD_SKIP_WXH_NEON(64, 64) + +#undef SAD_SKIP_WXH_NEON + +static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; + uint32x4_t sum_u32; + + int i = h; + do { + uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3; + uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vpadalq_u8(sum[0], diff0); + + s1 = vld1q_u8(src_ptr + 16); + r1 = vld1q_u8(ref_ptr + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vpadalq_u8(sum[1], diff1); + + s2 = vld1q_u8(src_ptr + 32); + r2 = vld1q_u8(ref_ptr + 32); + p2 = vld1q_u8(second_pred + 32); + avg2 = vrhaddq_u8(r2, p2); + diff2 = vabdq_u8(s2, avg2); + sum[2] = vpadalq_u8(sum[2], diff2); + + s3 = vld1q_u8(src_ptr + 48); + r3 = vld1q_u8(ref_ptr + 48); + p3 = vld1q_u8(second_pred + 48); + avg3 = vrhaddq_u8(r3, p3); + diff3 = vabdq_u8(s3, avg3); + sum[3] = vpadalq_u8(sum[3], diff3); + src_ptr += src_stride; ref_ptr += ref_stride; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(b_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(b_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(b_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(b_hi)); - } - return abs; + second_pred += 64; + } while (--i != 0); + + sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sum_u32 = vpadalq_u16(sum_u32, sum[2]); + sum_u32 = vpadalq_u16(sum_u32, sum[3]); + + return horizontal_add_uint32x4(sum_u32); } -static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_lo = vld1q_u8(src_ptr); - const uint8x16_t a_hi = vld1q_u8(src_ptr + 16); - const uint8x16_t b_lo = vld1q_u8(ref_ptr); - const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16); - const uint8x16_t c_lo = vld1q_u8(second_pred); - const uint8x16_t c_hi = vld1q_u8(second_pred + 16); - const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo); - const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi); +static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr, + 
int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint32x4_t sum = vdupq_n_u32(0); + + int i = h; + do { + uint8x16_t s0 = vld1q_u8(src_ptr); + uint8x16_t r0 = vld1q_u8(ref_ptr); + uint8x16_t p0 = vld1q_u8(second_pred); + uint8x16_t avg0 = vrhaddq_u8(r0, p0); + uint8x16_t diff0 = vabdq_u8(s0, avg0); + uint16x8_t sum0 = vpaddlq_u8(diff0); + + uint8x16_t s1 = vld1q_u8(src_ptr + 16); + uint8x16_t r1 = vld1q_u8(ref_ptr + 16); + uint8x16_t p1 = vld1q_u8(second_pred + 16); + uint8x16_t avg1 = vrhaddq_u8(r1, p1); + uint8x16_t diff1 = vabdq_u8(s1, avg1); + uint16x8_t sum1 = vpaddlq_u8(diff1); + + sum = vpadalq_u16(sum, sum0); + sum = vpadalq_u16(sum, sum1); + src_ptr += src_stride; ref_ptr += ref_stride; second_pred += 32; - abs = vabal_u8(abs, vget_low_u8(a_lo), vget_low_u8(avg_lo)); - abs = vabal_u8(abs, vget_high_u8(a_lo), vget_high_u8(avg_lo)); - abs = vabal_u8(abs, vget_low_u8(a_hi), vget_low_u8(avg_hi)); - abs = vabal_u8(abs, vget_high_u8(a_hi), vget_high_u8(avg_hi)); - } - return abs; -} + } while (--i != 0); -#define SAD32XN(n) \ - uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint16x8_t abs = \ - sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint16x8(abs); \ - } \ - \ - uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint16x8_t abs = \ - sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint16x8(abs); \ - } -#endif // defined(__ARM_FEATURE_DOTPROD) - -SAD32XN(16) -SAD32XN(32) -SAD32XN(64) - -#if defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3); - src_ptr += src_stride; - ref_ptr += ref_stride; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + return horizontal_add_uint32x4(sum); } -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint32x4_t prod = vdupq_n_u32(0); - const uint8x16_t ones = vdupq_n_u8(1); - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = 
vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0); - const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1); - const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2); - const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3); +static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x16_t s = vld1q_u8(src_ptr); + uint8x16_t r = vld1q_u8(ref_ptr); + uint8x16_t p = vld1q_u8(second_pred); + + uint8x16_t avg = vrhaddq_u8(r, p); + uint8x16_t diff = vabdq_u8(s, avg); + sum = vpadalq_u8(sum, diff); + src_ptr += src_stride; ref_ptr += ref_stride; - second_pred += 64; - prod = vdotq_u32(prod, sad_0_u8, ones); - prod = vdotq_u32(prod, sad_1_u8, ones); - prod = vdotq_u32(prod, sad_2_u8, ones); - prod = vdotq_u32(prod, sad_3_u8, ones); - } - return prod; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#else // !defined(__ARM_FEATURE_DOTPROD) -static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); + +static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); + + int i = h; + do { + uint8x8_t s = vld1_u8(src_ptr); + uint8x8_t r = vld1_u8(ref_ptr); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + src_ptr += src_stride; ref_ptr += ref_stride; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(b_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(b_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(b_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(b_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(b_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(b_3)); - } + second_pred += 8; + } while (--i != 0); - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } + return horizontal_add_uint16x8(sum); } -static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - const uint8_t *second_pred, - const int height) { - int i; - uint16x8_t abs_0 = vdupq_n_u16(0); - uint16x8_t abs_1 = vdupq_n_u16(0); - - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = 
vld1q_u8(src_ptr); - const uint8x16_t a_1 = vld1q_u8(src_ptr + 16); - const uint8x16_t a_2 = vld1q_u8(src_ptr + 32); - const uint8x16_t a_3 = vld1q_u8(src_ptr + 48); - const uint8x16_t b_0 = vld1q_u8(ref_ptr); - const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16); - const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32); - const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48); - const uint8x16_t c_0 = vld1q_u8(second_pred); - const uint8x16_t c_1 = vld1q_u8(second_pred + 16); - const uint8x16_t c_2 = vld1q_u8(second_pred + 32); - const uint8x16_t c_3 = vld1q_u8(second_pred + 48); - const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0); - const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1); - const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2); - const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3); - src_ptr += src_stride; - ref_ptr += ref_stride; - second_pred += 64; - abs_0 = vabal_u8(abs_0, vget_low_u8(a_0), vget_low_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_0), vget_high_u8(avg_0)); - abs_0 = vabal_u8(abs_0, vget_low_u8(a_1), vget_low_u8(avg_1)); - abs_0 = vabal_u8(abs_0, vget_high_u8(a_1), vget_high_u8(avg_1)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_2), vget_low_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_2), vget_high_u8(avg_2)); - abs_1 = vabal_u8(abs_1, vget_low_u8(a_3), vget_low_u8(avg_3)); - abs_1 = vabal_u8(abs_1, vget_high_u8(a_3), vget_high_u8(avg_3)); - } +static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + const uint8_t *second_pred) { + uint16x8_t sum = vdupq_n_u16(0); - { - const uint32x4_t sum = vpaddlq_u16(abs_0); - return vpadalq_u16(sum, abs_1); - } + int i = h / 2; + do { + uint8x8_t s = load_unaligned_u8(src_ptr, src_stride); + uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride); + uint8x8_t p = vld1_u8(second_pred); + + uint8x8_t avg = vrhadd_u8(r, p); + sum = vabal_u8(sum, s, avg); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + second_pred += 8; + } while (--i != 0); + + return horizontal_add_uint16x8(sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - -#define SAD64XN(n) \ - uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride) { \ - const uint32x4_t abs = \ - sad64x(src_ptr, src_stride, ref_ptr, ref_stride, n); \ - return horizontal_add_uint32x4(abs); \ - } \ - \ - uint32_t vpx_sad64x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *ref_ptr, int ref_stride, \ - const uint8_t *second_pred) { \ - const uint32x4_t abs = \ - sad64x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \ - return horizontal_add_uint32x4(abs); \ + +#define SAD_WXH_AVG_NEON(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \ + const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ } -SAD64XN(32) -SAD64XN(64) +SAD_WXH_AVG_NEON(4, 4) +SAD_WXH_AVG_NEON(4, 8) + +SAD_WXH_AVG_NEON(8, 4) +SAD_WXH_AVG_NEON(8, 8) +SAD_WXH_AVG_NEON(8, 16) + +SAD_WXH_AVG_NEON(16, 8) +SAD_WXH_AVG_NEON(16, 16) +SAD_WXH_AVG_NEON(16, 32) + +SAD_WXH_AVG_NEON(32, 16) +SAD_WXH_AVG_NEON(32, 32) +SAD_WXH_AVG_NEON(32, 64) + +SAD_WXH_AVG_NEON(64, 32) +SAD_WXH_AVG_NEON(64, 64) + +#undef SAD_WXH_AVG_NEON diff --git a/vpx_dsp/arm/sad_neon_dotprod.c b/vpx_dsp/arm/sad_neon_dotprod.c new file mode 100644 index 000000000..fbc0b8d75 --- /dev/null +++ b/vpx_dsp/arm/sad_neon_dotprod.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 The 
WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h); +} + +static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h); +} + +static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + diff0 = vabdq_u8(s0, r0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + diff1 = vabdq_u8(s1, r1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \ + } + +SAD_WXH_NEON_DOTPROD(16, 8) +SAD_WXH_NEON_DOTPROD(16, 16) +SAD_WXH_NEON_DOTPROD(16, 32) + +SAD_WXH_NEON_DOTPROD(32, 16) +SAD_WXH_NEON_DOTPROD(32, 32) +SAD_WXH_NEON_DOTPROD(32, 64) + +SAD_WXH_NEON_DOTPROD(64, 32) +SAD_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_NEON_DOTPROD + +#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_sad_skip_##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \ + 2 * ref_stride, (h) / 2); \ + } + +SAD_SKIP_WXH_NEON_DOTPROD(16, 8) +SAD_SKIP_WXH_NEON_DOTPROD(16, 16) 
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32) + +SAD_SKIP_WXH_NEON_DOTPROD(32, 16) +SAD_SKIP_WXH_NEON_DOTPROD(32, 32) +SAD_SKIP_WXH_NEON_DOTPROD(32, 64) + +SAD_SKIP_WXH_NEON_DOTPROD(64, 32) +SAD_SKIP_WXH_NEON_DOTPROD(64, 64) + +#undef SAD_SKIP_WXH_NEON_DOTPROD + +static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + const uint8_t *second_pred) { + // Only two accumulators are required for optimal instruction throughput of + // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes. + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h; + do { + int j = 0; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr + j); + r0 = vld1q_u8(ref_ptr + j); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + s1 = vld1q_u8(src_ptr + j + 16); + r1 = vld1q_u8(ref_ptr + j + 16); + p1 = vld1q_u8(second_pred + 16); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + j += 32; + second_pred += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +static INLINE unsigned int sad64xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, + h, second_pred); +} + +static INLINE unsigned int sad32xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, + h, second_pred); +} + +static INLINE unsigned int sad16xh_avg_neon_dotprod( + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, + int ref_stride, int h, const uint8_t *second_pred) { + uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + r0 = vld1q_u8(ref_ptr); + p0 = vld1q_u8(second_pred); + avg0 = vrhaddq_u8(r0, p0); + diff0 = vabdq_u8(s0, avg0); + sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + + s1 = vld1q_u8(src_ptr); + r1 = vld1q_u8(ref_ptr); + p1 = vld1q_u8(second_pred); + avg1 = vrhaddq_u8(r1, p1); + diff1 = vabdq_u8(s1, avg1); + sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + second_pred += 16; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sum[0], sum[1])); +} + +#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \ + uint32_t vpx_sad##w##x##h##_avg_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + const uint8_t *second_pred) { \ + return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \ + second_pred); \ + } + +SAD_WXH_AVG_NEON_DOTPROD(16, 8) +SAD_WXH_AVG_NEON_DOTPROD(16, 16) +SAD_WXH_AVG_NEON_DOTPROD(16, 32) + +SAD_WXH_AVG_NEON_DOTPROD(32, 16) +SAD_WXH_AVG_NEON_DOTPROD(32, 32) +SAD_WXH_AVG_NEON_DOTPROD(32, 64) + +SAD_WXH_AVG_NEON_DOTPROD(64, 32) +SAD_WXH_AVG_NEON_DOTPROD(64, 64) + +#undef SAD_WXH_AVG_NEON_DOTPROD diff --git a/vpx_dsp/arm/sse_neon.c b/vpx_dsp/arm/sse_neon.c new file mode 
100644 index 000000000..f686dc350 --- /dev/null +++ b/vpx_dsp/arm/sse_neon.c @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + uint8x8_t abs_diff_lo = vget_low_u8(abs_diff); + uint8x8_t abs_diff_hi = vget_high_u8(abs_diff); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo)); + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi)); +} + +static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x4_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff)); +} + +static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x4_t sse = vdupq_n_u32(0); + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon(src + j, ref + j, &sse); + j += 8; + } while (j < width); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + } + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + sse_16x1_neon(src + 32, ref + 32, &sse[0]); + sse_16x1_neon(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + sse_16x1_neon(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + 
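sse_16x1_neon leans on two facts: |a - b|^2 == (a - b)^2, so the unsigned absolute difference can be squared in place of a signed difference, and 255 * 255 = 65025 still fits in 16 bits, so vmull_u8 cannot overflow before the widening vpadalq_u16 accumulate. The same step as a standalone sketch (illustrative, standard intrinsics only):

#include <arm_neon.h>
#include <stdint.h>

// Sum of squared errors over one 16-byte row.
static uint32_t sse16(const uint8_t *s, const uint8_t *r) {
  const uint8x16_t abs_diff = vabdq_u8(vld1q_u8(s), vld1q_u8(r));
  const uint8x8_t lo = vget_low_u8(abs_diff);
  const uint8x8_t hi = vget_high_u8(abs_diff);
  uint32x4_t sse = vdupq_n_u32(0);
  // u8 x u8 -> u16 squares, pairwise-added into u32 lanes.
  sse = vpadalq_u16(sse, vmull_u8(lo, lo));
  sse = vpadalq_u16(sse, vmull_u8(hi, hi));
  return vaddvq_u32(sse);  // AArch64; Armv7 builds need the pairwise fallback
}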
uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_8x1_neon(src, ref, &sse); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(sse); +} + +static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse = vdupq_n_u32(0); + + int i = height; + do { + sse_4x2_neon(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(sse); +} + +int64_t vpx_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref, + int ref_stride, int width, int height) { + switch (width) { + case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height); + case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height); + case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height); + case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height); + case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height); + } +} diff --git a/vpx_dsp/arm/sse_neon_dotprod.c b/vpx_dsp/arm/sse_neon_dotprod.c new file mode 100644 index 000000000..877777391 --- /dev/null +++ b/vpx_dsp/arm/sse_neon_dotprod.c @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" + +static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x4_t *sse) { + uint8x16_t s = vld1q_u8(src); + uint8x16_t r = vld1q_u8(ref); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + *sse = vdotq_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref, + uint32x2_t *sse) { + uint8x8_t s = vld1_u8(src); + uint8x8_t r = vld1_u8(ref); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + uint32x2_t *sse) { + uint8x8_t s = load_unaligned_u8(src, src_stride); + uint8x8_t r = load_unaligned_u8(ref, ref_stride); + + uint8x8_t abs_diff = vabd_u8(s, r); + + *sse = vdot_u32(*sse, abs_diff, abs_diff); +} + +static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int width, int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + if ((width & 0x07) && ((width & 0x07) < 5)) { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j + 4 < width); + + sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]); + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } else { + int i = height; + do { + int j = 0; + do { + sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]); + sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride, + &sse[1]); + j += 8; + } while (j < width); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + } + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]); + sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = height; + do { + sse_16x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_16x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int 
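The dotprod file replaces that multiply/accumulate pair with a self dot product: UDOTing the absolute-difference vector with itself produces four 32-bit sums of squared differences in one instruction. A sketch, again assuming a +dotprod target:

#include <arm_neon.h>
#include <stdint.h>

static uint32_t sse16_dotprod(const uint8_t *s, const uint8_t *r) {
  const uint8x16_t d = vabdq_u8(vld1q_u8(s), vld1q_u8(r));
  // d . d: each u32 lane gains at most 4 * 255^2 (~2^18) per step, so a
  // lane has headroom for far more rows than a 64x64 block ever needs.
  return vaddvq_u32(vdotq_u32(vdupq_n_u32(0), d, d));
}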
src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = height; + do { + sse_8x1_neon_dotprod(src, ref, &sse[0]); + src += src_stride; + ref += ref_stride; + sse_8x1_neon_dotprod(src, ref, &sse[1]); + src += src_stride; + ref += ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x4(vcombine_u32(sse[0], sse[1])); +} + +static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, + int height) { + uint32x2_t sse = vdup_n_u32(0); + + int i = height; + do { + sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse); + + src += 2 * src_stride; + ref += 2 * ref_stride; + i -= 2; + } while (i != 0); + + return horizontal_add_uint32x2(sse); +} + +int64_t vpx_sse_neon_dotprod(const uint8_t *src, int src_stride, + const uint8_t *ref, int ref_stride, int width, + int height) { + switch (width) { + case 4: + return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 8: + return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 16: + return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 32: + return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + case 64: + return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height); + default: + return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width, + height); + } +} diff --git a/vpx_dsp/arm/subpel_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c index 9328c3ed8..d92f1615d 100644 --- a/vpx_dsp/arm/subpel_variance_neon.c +++ b/vpx_dsp/arm/subpel_variance_neon.c @@ -143,59 +143,58 @@ static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr, return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } -#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ - unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, unsigned int *sse) { \ - if (xoffset == 0) { \ - if (yoffset == 0) { \ - return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \ - sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp[w * h]; \ - var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ - yoffset); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ - } \ - } else if (xoffset == 4) { \ - uint8_t tmp0[w * (h + padding)]; \ - if (yoffset == 0) { \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp1[w * (h + padding)]; \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ - var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp1[w * (h + padding)]; \ - var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ - var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } \ - } else { \ - uint8_t tmp0[w * (h + padding)]; \ - if (yoffset == 0) { \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ - return vpx_variance##w##x##h##_neon(tmp0, 
w, ref, ref_stride, sse); \ - } else if (yoffset == 4) { \ - uint8_t tmp1[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ - xoffset); \ - var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } else { \ - uint8_t tmp1[w * h]; \ - var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ - xoffset); \ - var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ - } \ - } \ +#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \ + unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, unsigned int *sse) { \ + if (xoffset == 0) { \ + if (yoffset == 0) { \ + return vpx_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \ + yoffset); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ + } \ + } else if (xoffset == 4) { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * (h + padding)]; \ + var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } else { \ + uint8_t tmp0[w * (h + padding)]; \ + if (yoffset == 0) { \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ + } else if (yoffset == 4) { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } else { \ + uint8_t tmp1[w * h]; \ + var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \ + xoffset); \ + var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ + } \ + } \ } // 4x<h> blocks are processed two rows at a time, so require an extra row of @@ -418,53 +417,53 @@ static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, uint8_t tmp[w * h]; \ if (yoffset == 0) { \ avg_pred(src, tmp, source_stride, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \ source_stride, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } else { \ avg_pred_var_filter_block2d_bil_w##w( \ src, tmp, source_stride, source_stride, h, yoffset, 
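The reworked subpel-variance macro (and the compound-prediction version that follows) is a case analysis on the eighth-pel offsets: offset 0 degenerates to a copy, offset 4 to a rounding average, and only the remaining offsets need the real bilinear filter, so each case can pick the cheapest pre-filter before calling the plain variance kernel. A scalar restatement, assuming the standard vpx_dsp bilinear tap table (taps sum to 128, FILTER_BITS == 7):

#include <stdint.h>

// Eighth-pel bilinear tap pairs; each pair sums to 1 << 7.
static const uint8_t kBilinearTaps[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 }
};

static uint8_t bilinear(uint8_t a, uint8_t b, int offset) {
  return (uint8_t)((a * kBilinearTaps[offset][0] +
                    b * kBilinearTaps[offset][1] + 64) >> 7);
}
// offset 0: taps {128, 0}  -> returns a unchanged            (copy path)
// offset 4: taps {64, 64}  -> ((a + b) * 64 + 64) >> 7
//                           == (a + b + 1) >> 1              (vrhaddq path)

The macro bodies also drop the explicit _neon suffix on the final variance calls, letting the vpx_variance##w##x##h RTCD hook resolve at runtime, which appears to be what makes the new _neon_dotprod variance kernels reachable from here.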
second_pred); \ - return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp, w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * (h + padding)]; \ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } else { \ uint8_t tmp0[w * (h + padding)]; \ if (yoffset == 0) { \ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \ xoffset, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } else { \ uint8_t tmp1[w * h]; \ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \ (h + padding), xoffset); \ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \ second_pred); \ - return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \ + return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \ } \ } \ } diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index 9a7c424e8..11821dc10 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -16,8 +16,51 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) { +#if VPX_ARCH_AARCH64 + return vaddlv_u8(a); +#else + const uint16x4_t b = vpaddl_u8(a); + const uint16x4_t c = vpadd_u16(b, b); + const uint16x4_t d = vpadd_u16(c, c); + return vget_lane_u16(d, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u8(a); +#else + const uint16x8_t b = vpaddlq_u8(a); + const uint16x4_t c = vadd_u16(vget_low_u16(b), vget_high_u16(b)); + const uint16x4_t d = vpadd_u16(c, c); + const uint16x4_t e = vpadd_u16(d, d); + return vget_lane_u16(e, 0); +#endif +} + +static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddv_u16(a); +#else + const uint16x4_t b = vpadd_u16(a, a); + const uint16x4_t c = vpadd_u16(b, b); + return vget_lane_u16(c, 0); +#endif +} + static INLINE int32_t horizontal_add_int16x8(const 
int16x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_s16(a); #else const int32x4_t b = vpaddlq_s16(a); @@ -29,7 +72,7 @@ static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) { } static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddlvq_u16(a); #else const uint32x4_t b = vpaddlq_u16(a); @@ -40,8 +83,67 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) { #endif } +static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) { +#if VPX_ARCH_AARCH64 + const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); + const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); + const uint16x8_t b0 = vpaddq_u16(a0, a1); + return vpaddlq_u16(b0); +#else + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + return vpaddlq_u16(vcombine_u16(b0, b1)); +#endif +} + +static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo, + const uint16x8_t vec_hi) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi); +#else + const uint32x4_t vec_l_lo = + vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo)); + const uint32x4_t vec_l_hi = + vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi)); + const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi); + const uint64x2_t b = vpaddlq_u32(a); + const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), + vreinterpret_u32_u64(vget_high_u64(b))); + return vget_lane_u32(c, 0); +#endif +} + +static INLINE uint32x4_t horizontal_long_add_4d_uint16x8( + const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) { + const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]); + const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]); + const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]); + const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]); + const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]); + const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]); + const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]); + const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]); +#if VPX_ARCH_AARCH64 + const uint32x4_t c0 = vpaddq_u32(b0, b1); + const uint32x4_t c1 = vpaddq_u32(b2, b3); + return vpaddq_u32(c0, c1); +#else + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + return vcombine_u32(d0, d1); +#endif +} + static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_s32(a); #else return vget_lane_s32(a, 0) + vget_lane_s32(a, 1); @@ -49,15 +151,16 @@ static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) { } static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddv_u32(a); #else - return vget_lane_u32(a, 0) + vget_lane_u32(a, 1); + const uint64x1_t b = vpaddl_u32(a); + return vget_lane_u32(vreinterpret_u32_u64(b), 0); #endif } static INLINE int32_t horizontal_add_int32x4(const 
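Every helper added to sum_neon.h follows one template: a single across-lanes instruction on AArch64, and a log2(lanes) chain of pairwise adds everywhere else. The library keys this on its configure-time VPX_ARCH_AARCH64 macro; the sketch below uses the compiler's __aarch64__ only to stay self-contained:

#include <arm_neon.h>
#include <stdint.h>

static uint32_t hadd_u16x8(const uint16x8_t a) {
#if defined(__aarch64__)
  return vaddlvq_u16(a);  // one widening add across all 8 lanes
#else
  // Widen u16x8 -> u32x4 -> u64x2, then add the two remaining lanes.
  const uint32x4_t b = vpaddlq_u16(a);
  const uint64x2_t c = vpaddlq_u32(b);
  return (uint32_t)(vgetq_lane_u64(c, 0) + vgetq_lane_u64(c, 1));
#endif
}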
int32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_s32(a); #else const int64x2_t b = vpaddlq_s32(a); @@ -68,7 +171,7 @@ static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) { } static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { -#if defined(__aarch64__) +#if VPX_ARCH_AARCH64 return vaddvq_u32(a); #else const uint64x2_t b = vpaddlq_u32(a); @@ -77,4 +180,96 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) { return vget_lane_u32(c, 0); #endif } + +static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) { +#if VPX_ARCH_AARCH64 + uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]); + uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]); + return vpaddq_u32(res01, res23); +#else + uint32x4_t res = vdupq_n_u32(0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[0]), res, 0); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[1]), res, 1); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[2]), res, 2); + res = vsetq_lane_u32(horizontal_add_uint32x4(sum[3]), res, 3); + return res; +#endif +} + +static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) { +#if VPX_ARCH_AARCH64 + return vaddlvq_u32(a); +#else + const uint64x2_t b = vpaddlq_u32(a); + return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1); +#endif +} + +static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_s64(a); +#else + return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) { +#if VPX_ARCH_AARCH64 + return vaddvq_u64(a); +#else + return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1); +#endif +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x2(const uint32x4_t a[2]) { + return horizontal_long_add_uint32x4(a[0]) + + horizontal_long_add_uint32x4(a[1]); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x4(const uint32x4_t a[4]) { + uint64x2_t sum = vpaddlq_u32(a[0]); + sum = vpadalq_u32(sum, a[1]); + sum = vpadalq_u32(sum, a[2]); + sum = vpadalq_u32(sum, a[3]); + + return horizontal_add_uint64x2(sum); +} + +static INLINE uint64_t horizontal_long_add_uint32x4_x8(const uint32x4_t a[8]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + +static INLINE uint64_t +horizontal_long_add_uint32x4_x16(const uint32x4_t a[16]) { + uint64x2_t sum[2]; + sum[0] = vpaddlq_u32(a[0]); + sum[1] = vpaddlq_u32(a[1]); + sum[0] = vpadalq_u32(sum[0], a[2]); + sum[1] = vpadalq_u32(sum[1], a[3]); + sum[0] = vpadalq_u32(sum[0], a[4]); + sum[1] = vpadalq_u32(sum[1], a[5]); + sum[0] = vpadalq_u32(sum[0], a[6]); + sum[1] = vpadalq_u32(sum[1], a[7]); + sum[0] = vpadalq_u32(sum[0], a[8]); + sum[1] = vpadalq_u32(sum[1], a[9]); + sum[0] = vpadalq_u32(sum[0], a[10]); + sum[1] = vpadalq_u32(sum[1], a[11]); + sum[0] = vpadalq_u32(sum[0], a[12]); + sum[1] = vpadalq_u32(sum[1], a[13]); + sum[0] = vpadalq_u32(sum[0], a[14]); + sum[1] = vpadalq_u32(sum[1], a[15]); + + return horizontal_add_uint64x2(vaddq_u64(sum[0], sum[1])); +} + #endif // VPX_VPX_DSP_ARM_SUM_NEON_H_ diff --git a/vpx_dsp/arm/sum_squares_neon.c b/vpx_dsp/arm/sum_squares_neon.c index cfefad993..074afe325 100644 --- a/vpx_dsp/arm/sum_squares_neon.c +++ 
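The horizontal_long_add_uint32x4_xN family exists to retire many u32 accumulators into a u64 total before any lane can wrap: vpaddlq_u32 widens the first accumulator and vpadalq_u32 folds each further one into the running u64x2 pair. Reduced to the 4-accumulator case (a restatement of the pattern above, not new behavior):

#include <arm_neon.h>
#include <stdint.h>

static uint64_t long_add_u32x4_x4(const uint32x4_t a[4]) {
  uint64x2_t sum = vpaddlq_u32(a[0]);  // 4 x u32 -> 2 x u64
  sum = vpadalq_u32(sum, a[1]);        // accumulate in 64 bits from here on
  sum = vpadalq_u32(sum, a[2]);
  sum = vpadalq_u32(sum, a[3]);
  return vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
}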
b/vpx_dsp/arm/sum_squares_neon.c @@ -9,77 +9,92 @@ */ #include <arm_neon.h> - #include <assert.h> + #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/sum_neon.h" uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { - uint64x1_t s2; - if (size == 4) { int16x4_t s[4]; - int32x4_t s0; - uint32x2_t s1; + int32x4_t sum_s32; s[0] = vld1_s16(src + 0 * stride); s[1] = vld1_s16(src + 1 * stride); s[2] = vld1_s16(src + 2 * stride); s[3] = vld1_s16(src + 3 * stride); - s0 = vmull_s16(s[0], s[0]); - s0 = vmlal_s16(s0, s[1], s[1]); - s0 = vmlal_s16(s0, s[2], s[2]); - s0 = vmlal_s16(s0, s[3], s[3]); - s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)), - vget_high_u32(vreinterpretq_u32_s32(s0))); - s2 = vpaddl_u32(s1); + + sum_s32 = vmull_s16(s[0], s[0]); + sum_s32 = vmlal_s16(sum_s32, s[1], s[1]); + sum_s32 = vmlal_s16(sum_s32, s[2], s[2]); + sum_s32 = vmlal_s16(sum_s32, s[3], s[3]); + + return horizontal_long_add_uint32x4(vreinterpretq_u32_s32(sum_s32)); } else { - int r = size; - uint64x2_t s1 = vdupq_n_u64(0); + uint64x2_t sum_u64 = vdupq_n_u64(0); + int rows = size; do { - int c = size; - int32x4_t s0 = vdupq_n_s32(0); - const int16_t *src_t = src; + const int16_t *src_ptr = src; + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int cols = size; do { int16x8_t s[8]; - s[0] = vld1q_s16(src_t + 0 * stride); - s[1] = vld1q_s16(src_t + 1 * stride); - s[2] = vld1q_s16(src_t + 2 * stride); - s[3] = vld1q_s16(src_t + 3 * stride); - s[4] = vld1q_s16(src_t + 4 * stride); - s[5] = vld1q_s16(src_t + 5 * stride); - s[6] = vld1q_s16(src_t + 6 * stride); - s[7] = vld1q_s16(src_t + 7 * stride); - s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0])); - s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1])); - s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2])); - s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3])); - s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4])); - s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5])); - s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6])); - s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7])); - s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0])); - s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1])); - s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2])); - s0 = vmlal_s16(s0, vget_high_s16(s[3]), vget_high_s16(s[3])); - s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4])); - s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5])); - s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6])); - s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7])); - src_t += 8; - c -= 8; - } while (c); + s[0] = vld1q_s16(src_ptr + 0 * stride); + s[1] = vld1q_s16(src_ptr + 1 * stride); + s[2] = vld1q_s16(src_ptr + 2 * stride); + s[3] = vld1q_s16(src_ptr + 3 * stride); + s[4] = vld1q_s16(src_ptr + 4 * stride); + s[5] = vld1q_s16(src_ptr + 5 * stride); + s[6] = vld1q_s16(src_ptr + 6 * stride); + s[7] = vld1q_s16(src_ptr + 7 * stride); - s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0))); - s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0))); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[0]), vget_low_s16(s[0])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[1]), vget_low_s16(s[1])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[2]), vget_low_s16(s[2])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[3]), vget_low_s16(s[3])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[4]), vget_low_s16(s[4])); 
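The contract the sum_squares rewrite must honor is just the scalar double loop; the NEON version restructures it into 8x8 strips with two s32 accumulators that are flushed into a u64x2 total after every strip, bounding how much can pile up in the 32-bit lanes. The reference, restated (the upstream C version lives in vpx_dsp/sum_squares.c):

#include <stdint.h>

static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                       int size) {
  uint64_t ss = 0;
  for (int r = 0; r < size; ++r) {
    for (int c = 0; c < size; ++c) {
      const int64_t v = src[r * stride + c];  // widen before squaring
      ss += (uint64_t)(v * v);
    }
  }
  return ss;
}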
+ sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[5]), vget_low_s16(s[5])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[6]), vget_low_s16(s[6])); + sum_s32[0] = + vmlal_s16(sum_s32[0], vget_low_s16(s[7]), vget_low_s16(s[7])); + + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[0]), vget_high_s16(s[0])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[1]), vget_high_s16(s[1])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[2]), vget_high_s16(s[2])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[3]), vget_high_s16(s[3])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[4]), vget_high_s16(s[4])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[5]), vget_high_s16(s[5])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[6]), vget_high_s16(s[6])); + sum_s32[1] = + vmlal_s16(sum_s32[1], vget_high_s16(s[7]), vget_high_s16(s[7])); + + src_ptr += 8; + cols -= 8; + } while (cols); + + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[0])); + sum_u64 = vpadalq_u32(sum_u64, vreinterpretq_u32_s32(sum_s32[1])); src += 8 * stride; - r -= 8; - } while (r); + rows -= 8; + } while (rows); - s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1)); + return horizontal_add_uint64x2(sum_u64); } - - return vget_lane_u64(s2, 0); } diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h index 41d44f2b1..74f85a6bb 100644 --- a/vpx_dsp/arm/transpose_neon.h +++ b/vpx_dsp/arm/transpose_neon.h @@ -23,44 +23,77 @@ // b0.val[1]: 04 05 06 07 20 21 22 23 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s16_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s16_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), vreinterpret_s16_s32(vget_high_s32(a1))); +#endif return b0; } static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { int32x4x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_s32_s64( + vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); + b0.val[1] = vreinterpretq_s32_s64( + vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1))); +#else b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); +#endif return b0; } static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { int64x2x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); + b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)); +#else b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)), vreinterpret_s64_s32(vget_low_s32(a1))); b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)), vreinterpret_s64_s32(vget_high_s32(a1))); +#endif return b0; } static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u8_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u8_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), vreinterpret_u8_u32(vget_low_u32(a1))); b0.val[1] = 
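On AArch64 the 64-bit-lane interleaves in transpose_neon.h become single TRN1/TRN2 instructions instead of vget/vcombine pairs. The identity, as a sketch:

#include <arm_neon.h>

// vtrn1q_s64(a, b) = { a[0], b[0] } and vtrn2q_s64(a, b) = { a[1], b[1] }
// (AArch64-only intrinsics) -- exactly the low/low and high/high
// recombination the portable fallback spells out by hand.
static void trn_s64(int64x2_t a, int64x2_t b, int64x2_t *lo, int64x2_t *hi) {
#if defined(__aarch64__)
  *lo = vtrn1q_s64(a, b);
  *hi = vtrn2q_s64(a, b);
#else
  *lo = vcombine_s64(vget_low_s64(a), vget_low_s64(b));
  *hi = vcombine_s64(vget_high_s64(a), vget_high_s64(b));
#endif
}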
vcombine_u8(vreinterpret_u8_u32(vget_high_u32(a0)), vreinterpret_u8_u32(vget_high_u32(a1))); +#endif return b0; } static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { uint16x8x2_t b0; +#if VPX_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), vreinterpret_u16_u32(vget_low_u32(a1))); b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), vreinterpret_u16_u32(vget_high_u32(a1))); +#endif return b0; } @@ -141,17 +174,13 @@ static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { // c0: 00 01 20 21 02 03 22 23 // c1: 10 11 30 31 12 13 32 33 - const int32x4_t c0 = - vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); - const int32x4_t c1 = - vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + const int16x8x2_t c0 = vpx_vtrnq_s64_to_s16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const int16x8x2_t d0 = - vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + const int16x8x2_t d0 = vtrnq_s16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -172,17 +201,13 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { // c0: 00 01 20 21 02 03 22 23 // c1: 10 11 30 31 12 13 32 33 - const uint32x4_t c0 = - vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1])); - const uint32x4_t c1 = - vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1])); + const uint16x8x2_t c0 = vpx_vtrnq_u64_to_u16(b0.val[0], b0.val[1]); // Swap 16 bit elements resulting in: // d0.val[0]: 00 10 20 30 02 12 22 32 // d0.val[1]: 01 11 21 31 03 13 23 33 - const uint16x8x2_t d0 = - vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1)); + const uint16x8x2_t d0 = vtrnq_u16(c0.val[0], c0.val[1]); *a0 = d0.val[0]; *a1 = d0.val[1]; @@ -281,7 +306,7 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, const int16x4_t a6, const int16x4_t a7, int16x8_t *const o0, int16x8_t *const o1, int16x8_t *const o2, int16x8_t *const o3) { - // Swap 16 bit elements. Goes from: + // Combine rows. 
Goes from: // a0: 00 01 02 03 // a1: 10 11 12 13 // a2: 20 21 22 23 @@ -291,53 +316,40 @@ static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, // a6: 60 61 62 63 // a7: 70 71 72 73 // to: - // b0.val[0]: 00 10 02 12 - // b0.val[1]: 01 11 03 13 - // b1.val[0]: 20 30 22 32 - // b1.val[1]: 21 31 23 33 - // b2.val[0]: 40 50 42 52 - // b2.val[1]: 41 51 43 53 - // b3.val[0]: 60 70 62 72 - // b3.val[1]: 61 71 63 73 + // b0: 00 01 02 03 40 41 42 43 + // b1: 10 11 12 13 50 51 52 53 + // b2: 20 21 22 23 60 61 62 63 + // b3: 30 31 32 33 70 71 72 73 + + const int16x8_t b0 = vcombine_s16(a0, a4); + const int16x8_t b1 = vcombine_s16(a1, a5); + const int16x8_t b2 = vcombine_s16(a2, a6); + const int16x8_t b3 = vcombine_s16(a3, a7); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 10 02 12 40 50 42 52 + // c0.val[1]: 01 11 03 13 41 51 43 53 + // c1.val[0]: 20 30 22 32 60 70 62 72 + // c1.val[1]: 21 31 23 33 61 71 63 73 - const int16x4x2_t b0 = vtrn_s16(a0, a1); - const int16x4x2_t b1 = vtrn_s16(a2, a3); - const int16x4x2_t b2 = vtrn_s16(a4, a5); - const int16x4x2_t b3 = vtrn_s16(a6, a7); + const int16x8x2_t c0 = vtrnq_s16(b0, b1); + const int16x8x2_t c1 = vtrnq_s16(b2, b3); // Swap 32 bit elements resulting in: - // c0.val[0]: 00 10 20 30 - // c0.val[1]: 02 12 22 32 - // c1.val[0]: 01 11 21 31 - // c1.val[1]: 03 13 23 33 - // c2.val[0]: 40 50 60 70 - // c2.val[1]: 42 52 62 72 - // c3.val[0]: 41 51 61 71 - // c3.val[1]: 43 53 63 73 + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 02 12 22 32 42 52 62 72 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 03 13 23 33 43 53 63 73 - const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), - vreinterpret_s32_s16(b1.val[0])); - const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), - vreinterpret_s32_s16(b1.val[1])); - const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), - vreinterpret_s32_s16(b3.val[0])); - const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), - vreinterpret_s32_s16(b3.val[1])); + const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), + vreinterpretq_s32_s16(c1.val[0])); + const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), + vreinterpretq_s32_s16(c1.val[1])); - // Swap 64 bit elements resulting in: - // o0: 00 10 20 30 40 50 60 70 - // o1: 01 11 21 31 41 51 61 71 - // o2: 02 12 22 32 42 52 62 72 - // o3: 03 13 23 33 43 53 63 73 - - *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), - vreinterpret_s16_s32(c2.val[0])); - *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), - vreinterpret_s16_s32(c3.val[0])); - *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), - vreinterpret_s16_s32(c2.val[1])); - *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), - vreinterpret_s16_s32(c3.val[1])); + *o0 = vreinterpretq_s16_s32(d0.val[0]); + *o1 = vreinterpretq_s16_s32(d1.val[0]); + *o2 = vreinterpretq_s16_s32(d0.val[1]); + *o3 = vreinterpretq_s16_s32(d1.val[1]); } static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, @@ -569,37 +581,73 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, } // Transpose 8x8 to a new location. -static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) { - // Swap 16 bit elements. - const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]); - const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]); - const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]); - const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]); - - // Swap 32 bit elements. 
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]), - vreinterpretq_s32_s16(c1.val[0])); - const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]), - vreinterpretq_s32_s16(c1.val[1])); - const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]), - vreinterpretq_s32_s16(c3.val[0])); - const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]), - vreinterpretq_s32_s16(c3.val[1])); - - // Swap 64 bit elements - const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]); - const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]); - const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]); - const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]); - - b[0] = e0.val[0]; - b[1] = e1.val[0]; - b[2] = e2.val[0]; - b[3] = e3.val[0]; - b[4] = e0.val[1]; - b[5] = e1.val[1]; - b[6] = e2.val[1]; - b[7] = e3.val[1]; +static INLINE void transpose_s16_8x8q(int16x8_t *a, int16x8_t *out) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]); + const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]); + const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]); + const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); + + out[0] = d0.val[0]; + out[1] = d1.val[0]; + out[2] = d2.val[0]; + out[3] = d3.val[0]; + out[4] = d0.val[1]; + out[5] = d1.val[1]; + out[6] = d2.val[1]; + out[7] = d3.val[1]; } static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, @@ -658,6 +706,7 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // 
d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); @@ -729,6 +778,7 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); @@ -866,6 +916,68 @@ static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/, out_right[7] = out[7].val[1]; } +static INLINE void transpose_s32_16x16(int32x4_t *left1, int32x4_t *right1, + int32x4_t *left2, int32x4_t *right2) { + int32x4_t tl[16], tr[16]; + + // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3. + tl[0] = left1[8]; + tl[1] = left1[9]; + tl[2] = left1[10]; + tl[3] = left1[11]; + tl[4] = left1[12]; + tl[5] = left1[13]; + tl[6] = left1[14]; + tl[7] = left1[15]; + tr[0] = right1[8]; + tr[1] = right1[9]; + tr[2] = right1[10]; + tr[3] = right1[11]; + tr[4] = right1[12]; + tr[5] = right1[13]; + tr[6] = right1[14]; + tr[7] = right1[15]; + + left1[8] = left2[0]; + left1[9] = left2[1]; + left1[10] = left2[2]; + left1[11] = left2[3]; + left1[12] = left2[4]; + left1[13] = left2[5]; + left1[14] = left2[6]; + left1[15] = left2[7]; + right1[8] = right2[0]; + right1[9] = right2[1]; + right1[10] = right2[2]; + right1[11] = right2[3]; + right1[12] = right2[4]; + right1[13] = right2[5]; + right1[14] = right2[6]; + right1[15] = right2[7]; + + left2[0] = tl[0]; + left2[1] = tl[1]; + left2[2] = tl[2]; + left2[3] = tl[3]; + left2[4] = tl[4]; + left2[5] = tl[5]; + left2[6] = tl[6]; + left2[7] = tl[7]; + right2[0] = tr[0]; + right2[1] = tr[1]; + right2[2] = tr[2]; + right2[3] = tr[3]; + right2[4] = tr[4]; + right2[5] = tr[5]; + right2[6] = tr[6]; + right2[7] = tr[7]; + + transpose_s32_8x8_2(left1, right1, left1, right1); + transpose_s32_8x8_2(left2, right2, left2, right2); + transpose_s32_8x8_2(left1 + 8, right1 + 8, left1 + 8, right1 + 8); + transpose_s32_8x8_2(left2 + 8, right2 + 8, left2 + 8, right2 + 8); +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c index 3ccc4e807..efb2c1d8d 100644 --- a/vpx_dsp/arm/variance_neon.c +++ b/vpx_dsp/arm/variance_neon.c @@ -19,143 +19,6 @@ #include "vpx_dsp/arm/sum_neon.h" #include "vpx_ports/mem.h" -#if defined(__ARM_FEATURE_DOTPROD) - -// Process a block of width 4 four rows at a time. 
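transpose_s32_16x16 reduces the 16x16 case to existing 8x8 machinery via the block-matrix identity (a restatement, not new code):

//   [ A  B ]^T   [ A^T  C^T ]
//   [ C  D ]   = [ B^T  D^T ]
//
// i.e. swap the off-diagonal 8x8 quadrants, then transpose each of the
// four quadrants in place -- the copy loops are the swap of B and C, and
// the four transpose_s32_8x8_2 calls are the per-quadrant step.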
-static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); - const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += 4 * src_stride; - ref_ptr += 4 * ref_stride; - i -= 4; - } while (i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of width 8 two rows at a time. -static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = - vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); - const uint8x16_t r = - vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += 2 * src_stride; - ref_ptr += 2 * ref_stride; - i -= 2; - } while (i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of width 16 one row at a time. -static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - const uint8x16_t s = vld1q_u8(src_ptr); - const uint8x16_t r = vld1q_u8(ref_ptr); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -// Process a block of any size where the width is divisible by 16. 
-static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride, - int w, int h, uint32_t *sse, int *sum) { - uint32x4_t src_sum = vdupq_n_u32(0); - uint32x4_t ref_sum = vdupq_n_u32(0); - uint32x4_t sse_u32 = vdupq_n_u32(0); - - int i = h; - do { - int j = 0; - do { - const uint8x16_t s = vld1q_u8(src_ptr + j); - const uint8x16_t r = vld1q_u8(ref_ptr + j); - - const uint8x16_t abs_diff = vabdq_u8(s, r); - sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); - - src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); - ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); - - j += 16; - } while (j < w); - - src_ptr += src_stride; - ref_ptr += ref_stride; - } while (--i != 0); - - *sum = horizontal_add_int32x4( - vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); - *sse = horizontal_add_uint32x4(sse_u32); -} - -static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int h, - uint32_t *sse, int *sum) { - variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum); -} - -static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, - const uint8_t *ref, int ref_stride, int h, - uint32_t *sse, int *sum) { - variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) - // Process a block of width 4 two rows at a time. static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, @@ -328,8 +191,6 @@ static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride, variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum); } -#endif // defined(__ARM_FEATURE_DOTPROD) - void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum) { @@ -369,165 +230,103 @@ VARIANCE_WXH_NEON(32, 64, 11) VARIANCE_WXH_NEON(64, 32, 11) VARIANCE_WXH_NEON(64, 64, 12) -#if defined(__ARM_FEATURE_DOTPROD) +#undef VARIANCE_WXH_NEON + +static INLINE unsigned int vpx_mse8xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sse) { - int i; - uint8x16_t a[2], b[2], abs_diff[2]; - uint32x4_t sse_vec[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + uint16x8_t sse0, sse1; - for (i = 0; i < 8; i++) { - a[0] = vld1q_u8(src_ptr); + s0 = vld1_u8(src_ptr); src_ptr += src_stride; - a[1] = vld1q_u8(src_ptr); + s1 = vld1_u8(src_ptr); src_ptr += src_stride; - b[0] = vld1q_u8(ref_ptr); + r0 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - b[1] = vld1q_u8(ref_ptr); + r1 = vld1_u8(ref_ptr); ref_ptr += ref_stride; - abs_diff[0] = vabdq_u8(a[0], b[0]); - abs_diff[1] = vabdq_u8(a[1], b[1]); + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); - sse_vec[0] = vdotq_u32(sse_vec[0], abs_diff[0], abs_diff[0]); - sse_vec[1] = vdotq_u32(sse_vec[1], abs_diff[1], abs_diff[1]); - } + sse0 = vmull_u8(diff0, diff0); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(diff1, diff1); + sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); - *sse = horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); - return horizontal_add_uint32x4(vaddq_u32(sse_vec[0], sse_vec[1])); + return 
horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } -unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, - int ref_stride) { - uint8x8_t a[4], b[4], abs_diff[4]; - uint32x2_t sse = vdup_n_u32(0); - - a[0] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[0] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[1] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[1] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[2] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[2] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[3] = vld1_u8(src_ptr); - b[3] = vld1_u8(ref_ptr); - - abs_diff[0] = vabd_u8(a[0], b[0]); - abs_diff[1] = vabd_u8(a[1], b[1]); - abs_diff[2] = vabd_u8(a[2], b[2]); - abs_diff[3] = vabd_u8(a[3], b[3]); - - sse = vdot_u32(sse, abs_diff[0], abs_diff[0]); - sse = vdot_u32(sse, abs_diff[1], abs_diff[1]); - sse = vdot_u32(sse, abs_diff[2], abs_diff[2]); - sse = vdot_u32(sse, abs_diff[3], abs_diff[3]); - - return vget_lane_u32(sse, 0); -} - -#else // !defined(__ARM_FEATURE_DOTPROD) +static INLINE unsigned int vpx_mse16xh_neon(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; -unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride, - const unsigned char *ref_ptr, int ref_stride, - unsigned int *sse) { - int i; - uint8x16_t a[2], b[2]; - int16x4_t diff_lo[4], diff_hi[4]; - uint16x8_t diff[4]; - int32x4_t sse_vec[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), - vdupq_n_s32(0) }; + int i = h; + do { + uint8x16_t s, r, diff; + uint16x8_t sse0, sse1; - for (i = 0; i < 8; i++) { - a[0] = vld1q_u8(src_ptr); - src_ptr += src_stride; - a[1] = vld1q_u8(src_ptr); + s = vld1q_u8(src_ptr); src_ptr += src_stride; - b[0] = vld1q_u8(ref_ptr); - ref_ptr += ref_stride; - b[1] = vld1q_u8(ref_ptr); + r = vld1q_u8(ref_ptr); ref_ptr += ref_stride; - diff[0] = vsubl_u8(vget_low_u8(a[0]), vget_low_u8(b[0])); - diff[1] = vsubl_u8(vget_high_u8(a[0]), vget_high_u8(b[0])); - diff[2] = vsubl_u8(vget_low_u8(a[1]), vget_low_u8(b[1])); - diff[3] = vsubl_u8(vget_high_u8(a[1]), vget_high_u8(b[1])); - - diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0])); - diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1])); - sse_vec[0] = vmlal_s16(sse_vec[0], diff_lo[0], diff_lo[0]); - sse_vec[1] = vmlal_s16(sse_vec[1], diff_lo[1], diff_lo[1]); - - diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2])); - diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3])); - sse_vec[2] = vmlal_s16(sse_vec[2], diff_lo[2], diff_lo[2]); - sse_vec[3] = vmlal_s16(sse_vec[3], diff_lo[3], diff_lo[3]); - - diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0])); - diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1])); - sse_vec[0] = vmlal_s16(sse_vec[0], diff_hi[0], diff_hi[0]); - sse_vec[1] = vmlal_s16(sse_vec[1], diff_hi[1], diff_hi[1]); - - diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2])); - diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3])); - sse_vec[2] = vmlal_s16(sse_vec[2], diff_hi[2], diff_hi[2]); - sse_vec[3] = vmlal_s16(sse_vec[3], diff_hi[3], diff_hi[3]); - } + diff = vabdq_u8(s, r); - sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[1]); - sse_vec[2] = vaddq_s32(sse_vec[2], sse_vec[3]); - sse_vec[0] = vaddq_s32(sse_vec[0], sse_vec[2]); + sse0 = vmull_u8(vget_low_u8(diff), vget_low_u8(diff)); + sse_u32[0] = vpadalq_u16(sse_u32[0], sse0); + sse1 = vmull_u8(vget_high_u8(diff), vget_high_u8(diff)); + 
sse_u32[1] = vpadalq_u16(sse_u32[1], sse1); + } while (--i != 0); - *sse = horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); - return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse_vec[0])); + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); } unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { - uint8x8_t a[4], b[4]; - int16x4_t diff_lo[4]; - uint16x8_t diff[4]; - int32x4_t sse; - - a[0] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[0] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[1] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[1] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[2] = vld1_u8(src_ptr); - src_ptr += src_stride; - b[2] = vld1_u8(ref_ptr); - ref_ptr += ref_stride; - a[3] = vld1_u8(src_ptr); - b[3] = vld1_u8(ref_ptr); - - diff[0] = vsubl_u8(a[0], b[0]); - diff[1] = vsubl_u8(a[1], b[1]); - diff[2] = vsubl_u8(a[2], b[2]); - diff[3] = vsubl_u8(a[3], b[3]); - - diff_lo[0] = vget_low_s16(vreinterpretq_s16_u16(diff[0])); - diff_lo[1] = vget_low_s16(vreinterpretq_s16_u16(diff[1])); - diff_lo[2] = vget_low_s16(vreinterpretq_s16_u16(diff[2])); - diff_lo[3] = vget_low_s16(vreinterpretq_s16_u16(diff[3])); - - sse = vmull_s16(diff_lo[0], diff_lo[0]); - sse = vmlal_s16(sse, diff_lo[1], diff_lo[1]); - sse = vmlal_s16(sse, diff_lo[2], diff_lo[2]); - sse = vmlal_s16(sse, diff_lo[3], diff_lo[3]); - - return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse)); + uint8x8_t s[2], r[2]; + uint16x8_t abs_diff[2]; + uint32x4_t sse; + + s[0] = load_u8(src_ptr, src_stride); + r[0] = load_u8(ref_ptr, ref_stride); + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + s[1] = load_u8(src_ptr, src_stride); + r[1] = load_u8(ref_ptr, ref_stride); + + abs_diff[0] = vabdl_u8(s[0], r[0]); + abs_diff[1] = vabdl_u8(s[1], r[1]); + + sse = vmull_u16(vget_low_u16(abs_diff[0]), vget_low_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[0]), vget_high_u16(abs_diff[0])); + sse = vmlal_u16(sse, vget_low_u16(abs_diff[1]), vget_low_u16(abs_diff[1])); + sse = vmlal_u16(sse, vget_high_u16(abs_diff[1]), vget_high_u16(abs_diff[1])); + + return horizontal_add_uint32x4(sse); } -#endif // defined(__ARM_FEATURE_DOTPROD) +#define VPX_MSE_WXH_NEON(w, h) \ + unsigned int vpx_mse##w##x##h##_neon( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON(8, 8) +VPX_MSE_WXH_NEON(8, 16) +VPX_MSE_WXH_NEON(16, 8) +VPX_MSE_WXH_NEON(16, 16) + +#undef VPX_MSE_WXH_NEON diff --git a/vpx_dsp/arm/variance_neon_dotprod.c b/vpx_dsp/arm/variance_neon_dotprod.c new file mode 100644 index 000000000..ab843e9fc --- /dev/null +++ b/vpx_dsp/arm/variance_neon_dotprod.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
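The VPX_MSE_WXH_NEON wrappers preserve the historical interface: the total is both stored through *sse and returned, and despite the name it is a raw sum of squared errors, not divided by the pixel count. A hypothetical call site (real callers normally reach this through the vpx_dsp_rtcd dispatch rather than the _neon symbol directly):

#include <stdint.h>

extern unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr,
                                      int src_stride,
                                      const unsigned char *ref_ptr,
                                      int ref_stride, unsigned int *sse);

static unsigned int block16_sse(const uint8_t *src, const uint8_t *ref) {
  unsigned int sse;
  // After the call, sse and the return value hold the same 16x16 total.
  return vpx_mse16x16_neon(src, 16, ref, 16, &sse);
}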
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/sum_neon.h" +#include "vpx_ports/mem.h" + +// Process a block of width 4 four rows at a time. +static INLINE void variance_4xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 4 * src_stride; + ref_ptr += 4 * ref_stride; + i -= 4; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 8 two rows at a time. +static INLINE void variance_8xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = + vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride)); + const uint8x16_t r = + vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride)); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + i -= 2; + } while (i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of width 16 one row at a time. +static INLINE void variance_16xh_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + const uint8x16_t s = vld1q_u8(src_ptr); + const uint8x16_t r = vld1q_u8(ref_ptr); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +// Process a block of any size where the width is divisible by 16. 
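The variance kernels in this new file use the dot-product extension twice per row: UDOT on the absolute difference with itself accumulates squared differences directly (|s - r| . |s - r| == (s - r)^2), and UDOT against an all-ones vector accumulates the plain byte sums, so the block sum falls out as sum(src) - sum(ref) with no widening to 16 bits. The VARIANCE_WXH_NEON_DOTPROD wrappers that follow combine the two as sse - sum^2 / (w * h), with the division done as a shift because w * h is a power of two; the vpx_mse* and vpx_get4x4sse_cs kernels further down reuse only the SSE half. A minimal scalar sketch of the (sse, sum) pair each helper accumulates (hypothetical function name, for illustration):

#include <stdint.h>

/* Scalar reference for the per-block accumulation done by the
 * variance_*_neon_dotprod helpers; illustrative only. */
static void variance_ref_c(const uint8_t *src, int src_stride,
                           const uint8_t *ref, int ref_stride, int w, int h,
                           uint32_t *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      *sum += diff;                    /* vdotq_u32(src_sum/ref_sum, x, 1s) */
      *sse += (uint32_t)(diff * diff); /* vdotq_u32(sse_u32, |s-r|, |s-r|) */
    }
    src += src_stride;
    ref += ref_stride;
  }
}

A w-by-h wrapper then returns *sse - (uint32_t)(((int64_t)sum * sum) >> shift) with shift == log2(w * h), i.e. the SSE minus the squared-mean term.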
+static INLINE void variance_large_neon_dotprod(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, int w, int h, + uint32_t *sse, int *sum) { + uint32x4_t src_sum = vdupq_n_u32(0); + uint32x4_t ref_sum = vdupq_n_u32(0); + uint32x4_t sse_u32 = vdupq_n_u32(0); + + int i = h; + do { + int j = 0; + do { + const uint8x16_t s = vld1q_u8(src_ptr + j); + const uint8x16_t r = vld1q_u8(ref_ptr + j); + + const uint8x16_t abs_diff = vabdq_u8(s, r); + sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff); + + src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1)); + ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1)); + + j += 16; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--i != 0); + + *sum = horizontal_add_int32x4( + vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum))); + *sse = horizontal_add_uint32x4(sse_u32); +} + +static INLINE void variance_32xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse, + sum); +} + +static INLINE void variance_64xh_neon_dotprod(const uint8_t *src, + int src_stride, + const uint8_t *ref, + int ref_stride, int h, + uint32_t *sse, int *sum) { + variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse, + sum); +} + +void vpx_get8x8var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_8xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, + sum); +} + +void vpx_get16x16var_neon_dotprod(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_16xh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, + sum); +} + +#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \ + unsigned int vpx_variance##w##x##h##_neon_dotprod( \ + const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \ + unsigned int *sse) { \ + int sum; \ + variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \ + &sum); \ + return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ + } + +VARIANCE_WXH_NEON_DOTPROD(4, 4, 4) +VARIANCE_WXH_NEON_DOTPROD(4, 8, 5) + +VARIANCE_WXH_NEON_DOTPROD(8, 4, 5) +VARIANCE_WXH_NEON_DOTPROD(8, 8, 6) +VARIANCE_WXH_NEON_DOTPROD(8, 16, 7) + +VARIANCE_WXH_NEON_DOTPROD(16, 8, 7) +VARIANCE_WXH_NEON_DOTPROD(16, 16, 8) +VARIANCE_WXH_NEON_DOTPROD(16, 32, 9) + +VARIANCE_WXH_NEON_DOTPROD(32, 16, 9) +VARIANCE_WXH_NEON_DOTPROD(32, 32, 10) +VARIANCE_WXH_NEON_DOTPROD(32, 64, 11) + +VARIANCE_WXH_NEON_DOTPROD(64, 32, 11) +VARIANCE_WXH_NEON_DOTPROD(64, 64, 12) + +#undef VARIANCE_WXH_NEON_DOTPROD + +static INLINE unsigned int vpx_mse8xh_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x2_t sse_u32[2] = { vdup_n_u32(0), vdup_n_u32(0) }; + + int i = h / 2; + do { + uint8x8_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabd_u8(s0, r0); + diff1 = vabd_u8(s1, r1); + + sse_u32[0] = vdot_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdot_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x2(vadd_u32(sse_u32[0], sse_u32[1])); +} + +static INLINE unsigned int 
vpx_mse16xh_neon_dotprod( + const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, + int ref_stride, int h) { + uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + + int i = h / 2; + do { + uint8x16_t s0, s1, r0, r1, diff0, diff1; + + s0 = vld1q_u8(src_ptr); + src_ptr += src_stride; + s1 = vld1q_u8(src_ptr); + src_ptr += src_stride; + r0 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + r1 = vld1q_u8(ref_ptr); + ref_ptr += ref_stride; + + diff0 = vabdq_u8(s0, r0); + diff1 = vabdq_u8(s1, r1); + + sse_u32[0] = vdotq_u32(sse_u32[0], diff0, diff0); + sse_u32[1] = vdotq_u32(sse_u32[1], diff1, diff1); + } while (--i != 0); + + return horizontal_add_uint32x4(vaddq_u32(sse_u32[0], sse_u32[1])); +} + +unsigned int vpx_get4x4sse_cs_neon_dotprod(const unsigned char *src_ptr, + int src_stride, + const unsigned char *ref_ptr, + int ref_stride) { + uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride); + uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride); + + uint8x16_t abs_diff = vabdq_u8(s, r); + + uint32x4_t sse = vdotq_u32(vdupq_n_u32(0), abs_diff, abs_diff); + + return horizontal_add_uint32x4(sse); +} + +#define VPX_MSE_WXH_NEON_DOTPROD(w, h) \ + unsigned int vpx_mse##w##x##h##_neon_dotprod( \ + const unsigned char *src_ptr, int src_stride, \ + const unsigned char *ref_ptr, int ref_stride, unsigned int *sse) { \ + *sse = vpx_mse##w##xh_neon_dotprod(src_ptr, src_stride, ref_ptr, \ + ref_stride, h); \ + return *sse; \ + } + +VPX_MSE_WXH_NEON_DOTPROD(8, 8) +VPX_MSE_WXH_NEON_DOTPROD(8, 16) +VPX_MSE_WXH_NEON_DOTPROD(16, 8) +VPX_MSE_WXH_NEON_DOTPROD(16, 16) + +#undef VPX_MSE_WXH_NEON_DOTPROD diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index b4cdd58c7..8b89862ba 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -17,6 +17,7 @@ #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" #include "vpx_ports/mem.h" // Note: @@ -31,1240 +32,6 @@ // instructions. This optimization is much faster in speed unit test, but slowed // down the whole decoder by 5%. -#if defined(__aarch64__) && \ - (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8)) - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { - 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, - 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, - 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { - 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, - 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 -}; - -DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { - /* Shift left and insert new last column in transposed 4x4 block. */ - 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, - /* Shift left and insert two new columns in transposed 4x4 block. */ - 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, - /* Shift left and insert three new columns in transposed 4x4 block. 
*/ - 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 -}; - -#if defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; - uint8x8_t d01, d23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_usdot(s0, filters, permute_tbl); - t1 = convolve8_4_usdot(s1, filters, permute_tbl); - t2 = convolve8_4_usdot(s2, filters, permute_tbl); - t3 = convolve8_4_usdot(s3, filters, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, permute_tbl); - d1 = convolve8_8_usdot(s1, filters, permute_tbl); - d2 = convolve8_8_usdot(s2, filters, permute_tbl); - d3 = convolve8_8_usdot(s3, filters, permute_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - uint8x16_t s0, s1, s2, s3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; - uint8x8_t d01, d23, dd01, dd23; - dd01 = vdup_n_u8(0); - dd23 = vdup_n_u8(0); - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_usdot(s0, filters, permute_tbl); - t1 = convolve8_4_usdot(s1, filters, permute_tbl); - t2 = convolve8_4_usdot(s2, filters, permute_tbl); - t3 = convolve8_4_usdot(s3, filters, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x3_t 
permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = convolve8_8_usdot(s0, filters, permute_tbl); - d1 = convolve8_8_usdot(s1, filters, permute_tbl); - d2 = convolve8_8_usdot(s2, filters, permute_tbl); - d3 = convolve8_8_usdot(s3, filters, permute_tbl); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } -} - -static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b = vqtbl2q_u8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, - uint8x8_t a2, uint8x8_t a3, - uint8x16_t *b0, uint8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
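Every convolve8_4_*/convolve8_8_* helper used in this file, whichever instruction-set variant is compiled, evaluates the same 8-tap, 7-bit fixed-point filter; vqrshrun_n_s16(..., 7) then rounds, narrows and saturates the result (the literal 7 is the FILTER_BITS constant that the cleanup further down substitutes). Per output pixel the arithmetic is, in scalar form (a sketch with a hypothetical name):

#include <stdint.h>

/* Scalar sketch of one 8-tap output pixel; 'taps' holds eight consecutive
 * source samples along the filter axis. Illustrative only. */
static uint8_t convolve8_pixel_ref_c(const uint8_t *taps,
                                     const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += taps[k] * filter[k];
  sum = (sum + 64) >> 7; /* vqrshrun_n_s16(..., 7): round to nearest... */
  if (sum < 0) sum = 0;  /* ...then saturate to [0, 255] */
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}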
- */ - - uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; - *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); -} - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
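The vertical dot-product kernels avoid reloading seven of the eleven input rows on each trip around the loop: a seven-row transposed history is kept in registers, four new rows are merged in with vqtbl2q_u8 through merge_block_tbl, four output rows are filtered, and the whole window then slides down four rows (the "shuffle everything up four rows" assignments). The same schedule in scalar form, for a 4-wide strip (a sketch with a hypothetical name; src is assumed to point three rows above the first output row, as after src -= 3 * src_stride above, and h to be a multiple of 4):

#include <stdint.h>
#include <string.h>

/* Scalar model of the vertical kernels' row reuse: an 11-row window, four new
 * rows loaded and four output rows produced per iteration. Illustrative only. */
static void convolve8_vert_4w_ref_c(const uint8_t *src, int src_stride,
                                    uint8_t *dst, int dst_stride,
                                    const int16_t *filter, int h) {
  uint8_t win[11][4];
  int y, x, k, t;
  for (k = 0; k < 7; ++k) memcpy(win[k], src + k * src_stride, 4);
  src += 7 * src_stride;
  for (y = 0; y < h; y += 4) {
    for (k = 0; k < 4; ++k) memcpy(win[7 + k], src + k * src_stride, 4);
    src += 4 * src_stride;
    for (k = 0; k < 4; ++k) { /* four output rows from the 11-row window */
      for (x = 0; x < 4; ++x) {
        int sum = 0;
        for (t = 0; t < 8; ++t) sum += win[k + t][x] * filter[t];
        sum = (sum + 64) >> 7;
        dst[(y + k) * dst_stride + x] =
            (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
      }
    }
    memmove(win, win[4], 7 * sizeof(win[0])); /* slide down four rows */
  }
}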
*/ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - uint8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - src += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_usdot_partial(s0123, s4567, filters); - d1 = convolve8_4_usdot_partial(s1234, s5678, filters); - d2 = convolve8_4_usdot_partial(s2345, s6789, filters); - d3 = convolve8_4_usdot_partial(s3456, s78910, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); - s += 7 * src_stride; - - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filters); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filters); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filters); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filters); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
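All of the _avg_ variants blend the filtered result into the existing destination with vrhadd_u8, the rounding halving add; per byte that is (sketch):

/* What vrhadd_u8 computes per byte in the avg kernels: */
static unsigned char avg_round_ref_c(unsigned char filtered,
                                     unsigned char dst_byte) {
  return (unsigned char)((filtered + dst_byte + 1) >> 1);
}

The plain NEON cleanup later in this hunk applies the same d = vrhadd_u8(d, dd) pattern while replacing the lane-wise u32 load/store sequences with the load_u8/store_u8 helpers.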
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -#else // !defined(__ARM_FEATURE_MATMUL_INT8) - -void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; - uint8x8_t d01, d23; - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = - convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); - d1 = - convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); - d2 = - convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); - d3 = - convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } -} - -void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, - int w, int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x16_t range_limit = vdupq_n_u8(128); - uint8x16_t s0, s1, s2, s3; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(x_step_q4 == 16); - - (void)x_step_q4; - (void)y0_q4; - (void)y_step_q4; - - src -= 3; - - if (w == 4) { - const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl); - do { - int32x4_t t0, t1, t2, t3; - int16x8_t t01, t23; - uint8x8_t d01, d23, dd01, 
dd23; - dd01 = vdup_n_u8(0); - dd23 = vdup_n_u8(0); - - load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); - - t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl); - t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl); - t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl); - t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl); - t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1)); - t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3)); - d01 = vqrshrun_n_s16(t01, 7); - d23 = vqrshrun_n_s16(t23, 7); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl); - const uint8_t *s; - uint8_t *d; - int width; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - - do { - width = w; - s = src; - d = dst; - do { - load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); - - d0 = - convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl); - d1 = - convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl); - d2 = - convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl); - d3 = - convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - s += 8; - d += 8; - width -= 8; - } while (width > 0); - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } -} - -static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b, - const uint8x16_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, XX, XX, XX, XX - * a1: 10, 11, 12, 13, XX, XX, XX, XX - * a2: 20, 21, 22, 23, XX, XX, XX, XX - * a3: 30, 31, 32, 33, XX, XX, XX, XX - * - * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. - */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b = vqtbl2q_s8(samples, permute_tbl); -} - -static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, - int8x8_t a3, int8x16_t *b0, - int8x16_t *b1, - const uint8x16x2_t permute_tbl) { - /* Transpose 8-bit elements and concatenate result rows as follows: - * a0: 00, 01, 02, 03, 04, 05, 06, 07 - * a1: 10, 11, 12, 13, 14, 15, 16, 17 - * a2: 20, 21, 22, 23, 24, 25, 26, 27 - * a3: 30, 31, 32, 33, 34, 35, 36, 37 - * - * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 - * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 - * - * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it - * as an argument is preferable to loading it directly from memory as this - * inline helper is called many times from the same parent function. 
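These fallback kernels have only the signed SDOT instruction available, so each sample is biased from [0, 255] down to [-128, 127] (the range_limit subtraction), and the constant this adds to every dot product, 128 * sum(filter[k]), is precomputed once as 'correction' and folded back in by the convolve8_*_sdot helpers. Scalar model of one tap group (a sketch with a hypothetical name):

#include <stdint.h>

/* Bias u8 samples to s8 for the signed dot product, then cancel the bias with
 * a precomputed correction term. Illustrative only. */
static int32_t filter_taps_sdot_ref_c(const uint8_t *taps,
                                      const int8_t *filter) {
  int32_t sum = 0, correction = 0;
  int k;
  for (k = 0; k < 8; ++k) correction += 128 * filter[k];
  for (k = 0; k < 8; ++k) sum += (taps[k] - 128) * filter[k];
  return sum + correction; /* == sum of taps[k] * filter[k], unbiased */
}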
- */ - - int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; - *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); - *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); -} - -void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. - */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. 
*/ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *filter, int x0_q4, - int x_step_q4, int y0_q4, int y_step_q4, int w, - int h) { - const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); - const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); - const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); - const uint8x8_t range_limit = vdup_n_u8(128); - const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; - int8x16x2_t samples_LUT; - - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); - assert(y_step_q4 == 16); - - (void)x0_q4; - (void)x_step_q4; - (void)y_step_q4; - - src -= 3 * src_stride; - - if (w == 4) { - const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int32x4_t d0, d1, d2, d3; - uint8x8_t d01, d23, dd01, dd23; - - load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - src += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); - transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); - transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); - transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456; - samples_LUT.val[1] = s78910; - s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); - d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7); - d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7); - - dd01 = load_u8(dst + 0 * dst_stride, dst_stride); - dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - - d01 = vrhadd_u8(d01, dd01); - d23 = vrhadd_u8(d23, dd23); - - store_u8(dst + 0 * dst_stride, dst_stride, d01); - store_u8(dst + 2 * dst_stride, dst_stride, d23); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. */ - s0123 = s4567; - s1234 = s5678; - s2345 = s6789; - s3456 = s78910; - - src += 4 * src_stride; - dst += 4 * dst_stride; - h -= 4; - } while (h > 0); - } else { - const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; - const uint8_t *s; - uint8_t *d; - int height; - - do { - height = h; - s = src; - d = dst; - - load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); - s += 7 * src_stride; - - /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); - - /* This operation combines a conventional transpose and the sample permute - * (see horizontal case) required before computing the dot product. 
- */ - transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, - tran_concat_tbl); - transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, - tran_concat_tbl); - transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, - tran_concat_tbl); - transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, - tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); - - do { - uint8x8_t t7, t8, t9, t10; - - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); - - transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, - tran_concat_tbl); - - /* Merge new data into block from previous iteration. */ - samples_LUT.val[0] = s3456_lo; - samples_LUT.val[1] = s78910_lo; - s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - samples_LUT.val[0] = s3456_hi; - samples_LUT.val[1] = s78910_hi; - s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); - s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); - s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filters); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filters); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filters); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filters); - - load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); - - d0 = vrhadd_u8(d0, dd0); - d1 = vrhadd_u8(d1, dd1); - d2 = vrhadd_u8(d2, dd2); - d3 = vrhadd_u8(d3, dd3); - - store_u8_8x4(d, dst_stride, d0, d1, d2, d3); - - /* Prepare block for next iteration - re-using as much as possible. */ - /* Shuffle everything up four rows. 
*/ - s0123_lo = s4567_lo; - s0123_hi = s4567_hi; - s1234_lo = s5678_lo; - s1234_hi = s5678_hi; - s2345_lo = s6789_lo; - s2345_hi = s6789_hi; - s3456_lo = s78910_lo; - s3456_hi = s78910_hi; - - s += 4 * src_stride; - d += 4 * dst_stride; - height -= 4; - } while (height > 0); - src += 8; - dst += 8; - w -= 8; - } while (w > 0); - } -} - -#endif // defined(__ARM_FEATURE_MATMUL_INT8) - -#else // !(defined(__aarch64__) && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8))) - void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -1273,8 +40,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -1286,25 +53,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, if (h == 4) { uint8x8_t d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); __builtin_prefetch(dst + 2 * dst_stride); @@ -1314,32 +78,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); 
7);
transpose_u8_4x4(&d01, &d23); - vst1_lane_u32((uint32_t *)(dst + 0 * dst_stride), - vreinterpret_u32_u8(d01), 0); - vst1_lane_u32((uint32_t *)(dst + 1 * dst_stride), - vreinterpret_u32_u8(d23), 0); - vst1_lane_u32((uint32_t *)(dst + 2 * dst_stride), - vreinterpret_u32_u8(d01), 1); - vst1_lane_u32((uint32_t *)(dst + 3 * dst_stride), - vreinterpret_u32_u8(d23), 1); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -1355,7 +109,7 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, } else { int width; const uint8_t *s; - uint8x8_t t4, t5, t6, t7; + uint8x8_t t4, t5, t6, t7, d04, d15, d26, d37; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { @@ -1395,32 +149,24 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - transpose_u8_8x4(&t0, &t1, &t2, &t3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t1), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t2), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t3), 1); - dst += dst_stride; + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; h -= 8; } while (h > 0); } else { uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; int16x8_t s11, s12, s13, s14; do { @@ -1466,17 +212,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = 
convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); - store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -1505,8 +252,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(x_step_q4 == 16); (void)x_step_q4; @@ -1516,10 +263,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3; if (h == 4) { - uint8x8_t d01, d23; + uint8x8_t d01, d23, dd01, dd23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - int16x8_t tt0, tt1, tt2, tt3; - uint32x4_t d0123 = vdupq_n_u32(0); __builtin_prefetch(src + 0 * src_stride); __builtin_prefetch(src + 1 * src_stride); @@ -1527,17 +272,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 3 * src_stride); load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s0 = vget_low_s16(tt0); - s1 = vget_low_s16(tt1); - s2 = vget_low_s16(tt2); - s3 = vget_low_s16(tt3); - s4 = vget_high_s16(tt0); - s5 = vget_high_s16(tt1); - s6 = vget_high_s16(tt2); + s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); + s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); __builtin_prefetch(dst + 2 * dst_stride); @@ -1547,35 +289,28 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, do { load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); transpose_u8_8x4(&t0, &t1, &t2, &t3); - tt0 = vreinterpretq_s16_u16(vmovl_u8(t0)); - tt1 = vreinterpretq_s16_u16(vmovl_u8(t1)); - tt2 = vreinterpretq_s16_u16(vmovl_u8(t2)); - tt3 = vreinterpretq_s16_u16(vmovl_u8(t3)); - s7 = vget_low_s16(tt0); - s8 = vget_low_s16(tt1); - s9 = vget_low_s16(tt2); - s10 = vget_low_s16(tt3); + s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); + s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); + s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); + s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 
7); transpose_u8_4x4(&d01, &d23); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); + dd01 = load_u8(dst + 0 * dst_stride, 2 * dst_stride); + dd23 = load_u8(dst + 1 * dst_stride, 2 * dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); - vst1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - vst1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 2); - vst1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 1); - vst1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); + store_u8(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -1595,8 +330,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; if (w == 4) { - uint32x4_t d0415 = vdupq_n_u32(0); - uint32x4_t d2637 = vdupq_n_u32(0); + uint8x8_t d04, d15, d26, d37, dd04, dd15, dd26, dd37; + do { load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); @@ -1633,48 +368,35 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); __builtin_prefetch(src + 7 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - transpose_u8_8x4(&t0, &t1, &t2, &t3); - - d0415 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0415, 0); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0415, 2); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d2637, 0); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d2637, 2); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 4 * dst_stride), d0415, 1); - d0415 = vld1q_lane_u32((uint32_t *)(dst + 5 * dst_stride), d0415, 3); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 6 * dst_stride), d2637, 1); - d2637 = vld1q_lane_u32((uint32_t *)(dst + 7 * dst_stride), d2637, 3); - d0415 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0415), vcombine_u8(t0, t1))); - d2637 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d2637), vcombine_u8(t2, t3))); - - vst1q_lane_u32((uint32_t *)dst, d0415, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0415, 3); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d2637, 3); - dst += dst_stride; + d04 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d15 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d26 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d37 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + transpose_u8_8x4(&d04, &d15, &d26, &d37); + + dd04 = load_u8(dst + 0 * 
dst_stride, 4 * dst_stride); + dd15 = load_u8(dst + 1 * dst_stride, 4 * dst_stride); + dd26 = load_u8(dst + 2 * dst_stride, 4 * dst_stride); + dd37 = load_u8(dst + 3 * dst_stride, 4 * dst_stride); + + d04 = vrhadd_u8(d04, dd04); + d15 = vrhadd_u8(d15, dd15); + d26 = vrhadd_u8(d26, dd26); + d37 = vrhadd_u8(d37, dd37); + + store_u8(dst + 0 * dst_stride, 4 * dst_stride, d04); + store_u8(dst + 1 * dst_stride, 4 * dst_stride, d15); + store_u8(dst + 2 * dst_stride, 4 * dst_stride, d26); + store_u8(dst + 3 * dst_stride, 4 * dst_stride, d37); + + dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } else { uint8_t *d; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; int16x8_t s11, s12, s13, s14; - uint8x16_t d01, d23, d45, d67; do { __builtin_prefetch(src + 0 * src_stride); @@ -1719,33 +441,27 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, s13 = vreinterpretq_s16_u16(vmovl_u8(t6)); s14 = vreinterpretq_s16_u16(vmovl_u8(t7)); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); - t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); - t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); - t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters); - transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); + transpose_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + d4 = vrhadd_u8(d4, vld1_u8(d + 4 * dst_stride)); + d5 = vrhadd_u8(d5, vld1_u8(d + 5 * dst_stride)); + d6 = vrhadd_u8(d6, vld1_u8(d + 6 * dst_stride)); + d7 = vrhadd_u8(d7, vld1_u8(d + 7 * dst_stride)); - d01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - d23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - d45 = vcombine_u8(vld1_u8(d + 4 * dst_stride), - vld1_u8(d + 5 * dst_stride)); - d67 = vcombine_u8(vld1_u8(d + 6 * dst_stride), - vld1_u8(d + 7 * dst_stride)); - d01 = vrhaddq_u8(d01, vcombine_u8(t0, t1)); - d23 = vrhaddq_u8(d23, vcombine_u8(t2, t3)); - d45 = vrhaddq_u8(d45, vcombine_u8(t4, t5)); - d67 = vrhaddq_u8(d67, vcombine_u8(t6, t7)); - - store_u8_8x8(d, dst_stride, vget_low_u8(d01), vget_high_u8(d01), - vget_low_u8(d23), vget_high_u8(d23), vget_low_u8(d45), - vget_high_u8(d45), vget_low_u8(d67), vget_high_u8(d67)); + store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); s0 = s8; s1 = s9; @@ -1761,7 +477,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, src += 8 * src_stride; dst += 8 * dst_stride; h -= 8; - } while (h > 0); + } while (h != 0); } } } @@ -1773,8 +489,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, 
ptrdiff_t src_stride, int h) { const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1784,33 +500,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -1820,21 +529,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0); - dst += dst_stride; - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1); - dst += dst_stride; + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -1843,13 +547,15 @@ void vpx_convolve8_vert_neon(const uint8_t 
*src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { @@ -1860,33 +566,26 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 4 * src_stride); __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; d = dst; height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); @@ -1896,19 +595,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - vst1_u8(d, t0); - d += dst_stride; - vst1_u8(d, t1); - d += dst_stride; - vst1_u8(d, t2); - d += dst_stride; - vst1_u8(d, t3); - d += dst_stride; + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -1917,6 +610,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + s += 4 * src_stride; + d += 4 * dst_stride; height -= 4; } while (height != 0); src += 8; @@ -1933,8 +628,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, int h) { const int16x8_t filters = vld1q_s16(filter[y0_q4]); - assert(!((intptr_t)dst & 3)); - assert(!(dst_stride & 3)); + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); assert(y_step_q4 == 16); (void)x0_q4; @@ -1944,34 
+639,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, src -= 3 * src_stride; if (w == 4) { - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23, dd01, dd23; int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3; - uint32x4_t d0123 = vdupq_n_u32(0); - - s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s1 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s2 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s3 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); + s4 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t4))); + s5 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t5))); + s6 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t6))); + + src += 7 * src_stride; do { - s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; - s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(vld1_u8(src)))); - src += src_stride; + load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t0))); + s8 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t1))); + s9 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t2))); + s10 = vreinterpret_s16_u16(vget_low_u16(vmovl_u8(t3))); __builtin_prefetch(dst + 0 * dst_stride); __builtin_prefetch(dst + 1 * dst_stride); @@ -1981,29 +668,22 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 1 * src_stride); __builtin_prefetch(src + 2 * src_stride); __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7); - - d0123 = vld1q_lane_u32((uint32_t *)(dst + 0 * dst_stride), d0123, 0); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 1 * dst_stride), d0123, 1); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 2 * dst_stride), d0123, 2); - d0123 = vld1q_lane_u32((uint32_t *)(dst + 3 * dst_stride), d0123, 3); - d0123 = vreinterpretq_u32_u8( - vrhaddq_u8(vreinterpretq_u8_u32(d0123), vcombine_u8(d01, d23))); - - vst1q_lane_u32((uint32_t *)dst, d0123, 0); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 1); - dst += dst_stride; - vst1q_lane_u32((uint32_t *)dst, d0123, 2); - dst += dst_stride; - vst1q_lane_u32((uint32_t 
*)dst, d0123, 3); - dst += dst_stride; + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -2012,14 +692,15 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s4 = s8; s5 = s9; s6 = s10; + src += 4 * src_stride; + dst += 4 * dst_stride; h -= 4; } while (h != 0); } else { int height; const uint8_t *s; uint8_t *d; - uint8x8_t t0, t1, t2, t3; - uint8x16_t d01, d23, dd01, dd23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3; int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; do { @@ -2030,33 +711,26 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(src + 4 * src_stride); __builtin_prefetch(src + 5 * src_stride); __builtin_prefetch(src + 6 * src_stride); - s = src; - s0 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s3 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s4 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s5 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s6 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); + s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); + s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); + s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); + + s = src + 7 * src_stride; d = dst; height = h; do { - s7 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s8 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s9 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; - s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); - s += src_stride; + load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); + s7 = vreinterpretq_s16_u16(vmovl_u8(t0)); + s8 = vreinterpretq_s16_u16(vmovl_u8(t1)); + s9 = vreinterpretq_s16_u16(vmovl_u8(t2)); + s10 = vreinterpretq_s16_u16(vmovl_u8(t3)); __builtin_prefetch(d + 0 * dst_stride); __builtin_prefetch(d + 1 * dst_stride); @@ -2066,28 +740,18 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, __builtin_prefetch(s + 1 * src_stride); __builtin_prefetch(s + 2 * src_stride); __builtin_prefetch(s + 3 * src_stride); - t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); - t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); - t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters); - t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); - - d01 = vcombine_u8(t0, t1); - d23 = vcombine_u8(t2, t3); - dd01 = vcombine_u8(vld1_u8(d + 0 * dst_stride), - vld1_u8(d + 1 * dst_stride)); - dd23 = vcombine_u8(vld1_u8(d + 2 * dst_stride), - vld1_u8(d + 3 * dst_stride)); - dd01 = vrhaddq_u8(dd01, d01); - dd23 = vrhaddq_u8(dd23, d23); - - vst1_u8(d, vget_low_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd01)); - d += dst_stride; - vst1_u8(d, vget_low_u8(dd23)); - d += dst_stride; - vst1_u8(d, vget_high_u8(dd23)); - d += dst_stride; + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, 
filters); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d0 = vrhadd_u8(d0, vld1_u8(d + 0 * dst_stride)); + d1 = vrhadd_u8(d1, vld1_u8(d + 1 * dst_stride)); + d2 = vrhadd_u8(d2, vld1_u8(d + 2 * dst_stride)); + d3 = vrhadd_u8(d3, vld1_u8(d + 3 * dst_stride)); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); s0 = s4; s1 = s5; @@ -2097,6 +761,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s5 = s9; s6 = s10; height -= 4; + s += 4 * src_stride; + d += 4 * dst_stride; } while (height != 0); src += 8; dst += 8; @@ -2104,7 +770,3 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, } while (w != 0); } } - -#endif // #if defined(__aarch64__) && - // (defined(__ARM_FEATURE_DOTPROD) || - // defined(__ARM_FEATURE_MATMUL_INT8)) diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h index ed7f18053..025e943cc 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.h +++ b/vpx_dsp/arm/vpx_convolve8_neon.h @@ -15,10 +15,18 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_filter.h" -#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) -static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h); + +static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, const int8x16_t samples_hi, const int32x4_t correction, const int8x8_t filters) { @@ -29,11 +37,11 @@ static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo, sum = vdotq_lane_s32(correction, samples_lo, filters, 0); sum = vdotq_lane_s32(sum, samples_hi, filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } -static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples, const int8x8_t filters, const int32x4_t correction, const uint8x16_t range_limit, @@ -54,8 +62,8 @@ static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples, sum = vdotq_lane_s32(correction, permuted_samples[0], filters, 0); sum = vdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, @@ -78,7 +86,7 @@ static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, @@ -111,14 +119,20 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples, /* Narrow and re-pack. 
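Each pair of 32-bit dot-product accumulators is saturating-narrowed to 16 bits here, then vqrshrun_n_s16 adds the rounding constant 1 << (FILTER_BITS - 1) = 64, shifts right by FILTER_BITS (7, the fixed-point precision of the 8-tap filters) and saturates to 8 bits; in effect, out = clamp((s0 * f0 + ... + s7 * f7 + 64) >> 7, 0, 255) per pixel.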
*/ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD) -#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) -static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h); + +static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, const uint8x16_t samples_hi, const int8x8_t filters) { /* Sample permutation is performed by the caller. */ @@ -127,11 +141,11 @@ static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo, sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0); sum = vusdotq_lane_s32(sum, samples_hi, filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } -static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, const int8x8_t filters, const uint8x16x2_t permute_tbl) { uint8x16_t permuted_samples[2]; @@ -147,8 +161,8 @@ static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples, sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0); sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1); - /* Narrowing and packing is performed by the caller. */ - return sum; + /* Further narrowing and packing is performed by the caller. */ + return vqmovn_s32(sum); } static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, @@ -169,7 +183,7 @@ static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, @@ -196,10 +210,10 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, /* Narrow and re-pack. */ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } -#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8) +#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8) static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, @@ -238,7 +252,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, sum = vmlaq_lane_s16(sum, s7, filters_hi, 3); sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3)); sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0)); - return vqrshrun_n_s16(sum, 7); + return vqrshrun_n_s16(sum, FILTER_BITS); } static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s, diff --git a/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c new file mode 100644 index 000000000..bf01364cf --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_neon_dotprod.c @@ -0,0 +1,777 @@ +/* + * Copyright (c) 2021 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. */ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +void vpx_convolve8_2d_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. 
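+ * In short: the 2D wrapper runs this horizontal pass over h + 7 rows (three above and four below the block) to feed the subsequent 8-tap vertical pass, and since the final output height is a multiple of 4, the intermediate height seen here is always congruent to 3 modulo 4.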
*/ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + } +} + +void vpx_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, 
filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_avg_horiz_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[x0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x16_t range_limit = vdupq_n_u8(128); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_sdot(s0, filters, correction, range_limit, perm_tbl); + t1 = convolve8_4_sdot(s1, filters, correction, range_limit, perm_tbl); + t2 = convolve8_4_sdot(s2, filters, correction, range_limit, perm_tbl); + t3 = convolve8_4_sdot(s3, filters, correction, range_limit, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_sdot(s0, filters, correction, range_limit, perm_tbl); + d1 = convolve8_8_sdot(s1, filters, correction, range_limit, perm_tbl); + d2 = convolve8_8_sdot(s2, filters, correction, range_limit, perm_tbl); + d3 = convolve8_8_sdot(s3, filters, correction, range_limit, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 
'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b = vqtbl2q_s8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2, + int8x8_t a3, int8x16_t *b0, + int8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } }; + *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. 
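+ * After this step each 32-bit lane of s0123 holds four vertically adjacent samples of one output column (rows 0-3 of columns 0-3), so a single vdotq_lane_s32 can multiply them by four consecutive filter taps and accumulate per column.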
+ */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. 
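+ * In the 8-wide case the transposed samples are split into lo/hi halves, lo covering columns 0-3 and hi columns 4-7, and the same per-column dot product is applied to each half.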
+ */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
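+ * Each iteration consumes source rows n to n+10 to produce four output rows, so seven rows carry over: only the four new rows are loaded and transposed, and the three overlapping blocks in between are rebuilt by the merge_block_tbl lookups above.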
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_avg_vert_neon_dotprod(const uint8_t *src, + ptrdiff_t src_stride, uint8_t *dst, + ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + int y_step_q4, int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter[y0_q4]), 128); + const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); + const uint8x8_t range_limit = vdup_n_u8(128); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t t0, t1, t2, t3, t4, t5, t6; + int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + src += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
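+ * vqtbl2q_s8 treats { s3456, s78910 } as a single 32-byte table, with indices 0-15 addressing the old block and 16-31 the new one, so the three merge_block_tbl permutes shift the transposed 4x4 block left by one, two and three columns, pulling the replacement columns from s78910 to form s4567, s5678 and s6789 without reloading any samples.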
*/ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters); + d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters); + d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters); + d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); + s += 7 * src_stride; + + /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ + s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); + s7 = vdup_n_s8(0); + s8 = vdup_n_s8(0); + s9 = vdup_n_s8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + uint8x8_t t7, t8, t9, t10; + + load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); + + s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. 
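+ * The lo and hi column halves are merged independently using the same three permute vectors.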
*/ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + correction, filters); + d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + correction, filters); + d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + correction, filters); + d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + correction, filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. */ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c new file mode 100644 index 000000000..e0e482e3f --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve8_neon_i8mm.c @@ -0,0 +1,698 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, + 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = { + 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, + 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = { + /* Shift left and insert new last column in transposed 4x4 block. */ + 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28, + /* Shift left and insert two new columns in transposed 4x4 block. */ + 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29, + /* Shift left and insert three new columns in transposed 4x4 block. 
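+ * Entries are vqtbl2 indices: values 0-15 select bytes from the first vector of the table pair (the current block) and values 16-31 from the second (the incoming block).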
*/ + 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 +}; + +void vpx_convolve8_2d_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + assert(h % 4 == 3); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + do { + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. */ + load_u8_16x3(src, src_stride, &s0, &s1, &s2); + + d0 = convolve8_4_usdot(s0, filters, perm_tbl); + d1 = convolve8_4_usdot(s1, filters, perm_tbl); + d2 = convolve8_4_usdot(s2, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, vdup_n_s16(0)), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8_4x1(dst + 2 * dst_stride, d23); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h > 3); + + /* Process final three rows (h % 4 == 3). See vpx_convolve_neon.c for + * further details on possible values of block height. 
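+ * As in the dotprod kernel, the intermediate block for the vertical pass is h + 7 rows tall, which leaves this three-row tail after the main 4-row loop.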
*/ + width = w; + s = src; + d = dst; + do { + load_u8_16x3(s, src_stride, &s0, &s1, &s2); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + + store_u8_8x3(d, dst_stride, d0, d1, d2); + + s += 8; + d += 8; + width -= 8; + } while (width > 0); + } +} + +void vpx_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +void vpx_convolve8_avg_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4])); + uint8x16_t s0, s1, s2, s3; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(x_step_q4 == 16); + + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; + + src -= 3; + + if (w == 4) { + const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl); + do { + int16x4_t t0, t1, t2, t3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3); + + t0 = convolve8_4_usdot(s0, filters, perm_tbl); + t1 = convolve8_4_usdot(s1, filters, perm_tbl); + t2 = convolve8_4_usdot(s2, filters, perm_tbl); + t3 = convolve8_4_usdot(s3, filters, perm_tbl); + d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const 
uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl); + const uint8_t *s; + uint8_t *d; + int width; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + + do { + width = w; + s = src; + d = dst; + do { + load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3); + + d0 = convolve8_8_usdot(s0, filters, perm_tbl); + d1 = convolve8_8_usdot(s1, filters, perm_tbl); + d2 = convolve8_8_usdot(s2, filters, perm_tbl); + d3 = convolve8_8_usdot(s3, filters, perm_tbl); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + width -= 8; + } while (width != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } +} + +static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b, + const uint8x16_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, XX, XX, XX, XX + * a1: 10, 11, 12, 13, XX, XX, XX, XX + * a2: 20, 21, 22, 23, XX, XX, XX, XX + * a3: 30, 31, 32, 33, XX, XX, XX, XX + * + * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. + */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b = vqtbl2q_u8(samples, permute_tbl); +} + +static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x16_t *b0, uint8x16_t *b1, + const uint8x16x2_t permute_tbl) { + /* Transpose 8-bit elements and concatenate result rows as follows: + * a0: 00, 01, 02, 03, 04, 05, 06, 07 + * a1: 10, 11, 12, 13, 14, 15, 16, 17 + * a2: 20, 21, 22, 23, 24, 25, 26, 27 + * a3: 30, 31, 32, 33, 34, 35, 36, 37 + * + * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33 + * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37 + * + * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it + * as an argument is preferable to loading it directly from memory as this + * inline helper is called many times from the same parent function. 
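
In scalar terms, these transpose/concat helpers put element j of input row i at byte i + 4 * j of the output block. A buffer-based C sketch of transpose_concat_4x4 (a model of the layout, not the NEON implementation; the name is illustrative):

#include <stdint.h>

/* Four 4-byte rows in, one 16-byte column-major block out. */
static void transpose_concat_4x4_model(const uint8_t a[4][4], uint8_t b[16]) {
  int i, j;
  for (i = 0; i < 4; ++i)
    for (j = 0; j < 4; ++j)
      b[4 * j + i] = a[i][j]; /* b: 00,10,20,30, 01,11,21,31, ... */
}
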
+ */ + + uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } }; + *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]); + *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]); +} + +void vpx_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
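
A note on the merge step above: each sNNNN vector holds a 4-row window in the transposed layout, so advancing the window by one row is a pure byte permutation over the pair {oldest block, newest block} - which is all dot_prod_merge_block_tbl encodes. A scalar model of the first table (shift by one row); the function name is illustrative:

#include <stdint.h>

/* Model of vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]): per 4-byte
 * group, keep the last 3 bytes of the old block and take 1 byte from
 * the new block (table indices 16..31 address the second input). */
static void merge_shift1_model(const uint8_t s3456[16],
                               const uint8_t s78910[16], uint8_t s4567[16]) {
  int g;
  for (g = 0; g < 4; ++g) {
    s4567[4 * g + 0] = s3456[4 * g + 1];
    s4567[4 * g + 1] = s3456[4 * g + 2];
    s4567[4 * g + 2] = s3456[4 * g + 3];
    s4567[4 * g + 3] = s78910[4 * g + 0];
  }
}
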
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} + +void vpx_convolve8_avg_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4])); + const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); + uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint8x16x2_t samples_LUT; + + assert((intptr_t)dst % 4 == 0); + assert(dst_stride % 4 == 0); + assert(y_step_q4 == 16); + + (void)x0_q4; + (void)x_step_q4; + (void)y_step_q4; + + src -= 3 * src_stride; + + if (w == 4) { + const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); + uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; + int16x4_t d0, d1, d2, d3; + uint8x8_t d01, d23, dd01, dd23; + + load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + src += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); + transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); + transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); + transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); + + do { + load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456; + samples_LUT.val[1] = s78910; + s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_4_usdot_partial(s0123, s4567, filters); + d1 = convolve8_4_usdot_partial(s1234, s5678, filters); + d2 = convolve8_4_usdot_partial(s2345, s6789, filters); + d3 = convolve8_4_usdot_partial(s3456, s78910, filters); + d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + dd01 = load_u8(dst + 0 * dst_stride, dst_stride); + dd23 = load_u8(dst + 2 * dst_stride, dst_stride); + + d01 = vrhadd_u8(d01, dd01); + d23 = vrhadd_u8(d23, dd23); + + store_u8(dst + 0 * dst_stride, dst_stride, d01); + store_u8(dst + 2 * dst_stride, dst_stride, d23); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123 = s4567; + s1234 = s5678; + s2345 = s6789; + s3456 = s78910; + + src += 4 * src_stride; + dst += 4 * dst_stride; + h -= 4; + } while (h != 0); + } else { + const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, + s6789_hi, s78910_lo, s78910_hi; + uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3; + const uint8_t *s; + uint8_t *d; + int height; + + do { + height = h; + s = src; + d = dst; + + load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + s7 = vdup_n_u8(0); + s8 = vdup_n_u8(0); + s9 = vdup_n_u8(0); + + /* This operation combines a conventional transpose and the sample permute + * (see horizontal case) required before computing the dot product. + */ + transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, + tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, + tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi, + tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, + tran_concat_tbl); + transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, + tran_concat_tbl); + transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, + tran_concat_tbl); + transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, + tran_concat_tbl); + + do { + load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, + tran_concat_tbl); + + /* Merge new data into block from previous iteration. */ + samples_LUT.val[0] = s3456_lo; + samples_LUT.val[1] = s78910_lo; + s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + samples_LUT.val[0] = s3456_hi; + samples_LUT.val[1] = s78910_hi; + s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]); + s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); + s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); + + d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, + filters); + d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, + filters); + d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, + filters); + d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, + filters); + + load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3); + + d0 = vrhadd_u8(d0, dd0); + d1 = vrhadd_u8(d1, dd1); + d2 = vrhadd_u8(d2, dd2); + d3 = vrhadd_u8(d3, dd3); + + store_u8_8x4(d, dst_stride, d0, d1, d2, d3); + + /* Prepare block for next iteration - re-using as much as possible. */ + /* Shuffle everything up four rows. 
*/ + s0123_lo = s4567_lo; + s0123_hi = s4567_hi; + s1234_lo = s5678_lo; + s1234_hi = s5678_hi; + s2345_lo = s6789_lo; + s2345_hi = s6789_hi; + s3456_lo = s78910_lo; + s3456_hi = s78910_hi; + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height != 0); + src += 8; + dst += 8; + w -= 8; + } while (w != 0); + } +} diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index 361ec8a80..bea7c9843 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -9,6 +9,7 @@ */ #include <arm_neon.h> +#include <string.h> #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" @@ -26,10 +27,10 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, if (w < 8) { // copy4 do { - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; - *(uint32_t *)dst = *(const uint32_t *)src; + memcpy(dst, src, 4); src += src_stride; dst += dst_stride; h -= 2; diff --git a/vpx_dsp/arm/vpx_convolve_neon_dotprod.c b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c new file mode 100644 index 000000000..400e26b30 --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_neon_dotprod.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. */ + vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data. 
*/ + vpx_convolve8_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + uint8_t temp[64 * 71]; + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_dotprod(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, intermediate_height); + + vpx_convolve8_avg_vert_neon_dotprod(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/vpx_dsp/arm/vpx_convolve_neon_i8mm.c b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c new file mode 100644 index 000000000..4d94bb79b --- /dev/null +++ b/vpx_dsp/arm/vpx_convolve_neon_i8mm.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/vpx_convolve8_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_ports/mem.h" + +void vpx_convolve8_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the + * maximum buffer size to 64 * (64 + 7). */ + uint8_t temp[64 * 71]; + + /* Account for the vertical phase needing 3 lines prior and 4 lines post. */ + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + /* Filter starting 3 lines back. */ + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, + w, intermediate_height); + + /* Step into the temp buffer 3 lines to get the actual frame data. 
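
The buffer sizing in these two-pass wrappers follows directly from the stated constraints: with an 8-tap filter the horizontal pass must produce taps - 1 = 7 extra rows (3 above, 4 below), so the w-stride scratch buffer needs at most 64 * (64 + 7) = 4544 bytes, and the vertical pass starts 3 rows in at temp + w * 3. A sketch of the sizing rule under those assumptions (the helper name is illustrative):

#include <stddef.h>

/* Scratch size for the two-pass convolution, assuming w <= 64,
 * h <= 64 and an 8-tap kernel as asserted by the callers. */
static size_t convolve_temp_size(int w, int h, int taps) {
  return (size_t)w * (size_t)(h + taps - 1); /* 64 * 71 == 4544 max */
}
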
*/ + vpx_convolve8_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); +} + +void vpx_convolve8_avg_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + uint8_t temp[64 * 71]; + const int intermediate_height = h + 7; + + assert(y_step_q4 == 16); + assert(x_step_q4 == 16); + + vpx_convolve8_2d_horiz_neon_i8mm(src - src_stride * 3, src_stride, temp, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, + w, intermediate_height); + + vpx_convolve8_avg_vert_neon_i8mm(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); +} diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c index 954015407..a8dcab7da 100644 --- a/vpx_dsp/avg.c +++ b/vpx_dsp/avg.c @@ -295,19 +295,19 @@ void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, vpx_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256); } - // coeff: 15 bit, dynamic range [-16320, 16320] + // coeff: 16 bit, dynamic range [-32768, 32767] for (idx = 0; idx < 256; ++idx) { tran_low_t a0 = coeff[0]; tran_low_t a1 = coeff[256]; tran_low_t a2 = coeff[512]; tran_low_t a3 = coeff[768]; - tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535] tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range - tran_low_t b2 = (a2 + a3) >> 2; // [-16320, 16320] + tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383] tran_low_t b3 = (a2 - a3) >> 2; - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] + coeff[0] = b0 + b2; // 16 bit, [-32768, 32767] coeff[256] = b1 + b3; coeff[512] = b0 - b2; coeff[768] = b1 - b3; @@ -428,7 +428,7 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int i, j; const uint16_t *s = CONVERT_TO_SHORTPTR(s8); const uint16_t *d = CONVERT_TO_SHORTPTR(d8); - *min = 255; + *min = 65535; *max = 0; for (i = 0; i < 8; ++i, s += p, d += dp) { for (j = 0; j < 8; ++j) { diff --git a/vpx_dsp/loongarch/quantize_lsx.c b/vpx_dsp/loongarch/quantize_lsx.c index 77be0bb4f..9bb1691e2 100644 --- a/vpx_dsp/loongarch/quantize_lsx.c +++ b/vpx_dsp/loongarch/quantize_lsx.c @@ -11,6 +11,8 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_util/loongson_intrinsics.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs, __m128i round, __m128i quant, @@ -88,15 +90,15 @@ static INLINE int16_t accumulate_eob(__m128i eob) { } #if !CONFIG_VP9_HIGHBITDEPTH + void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, - int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m128i zero = __lsx_vldi(0); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, quant_shift; __m128i coeff0, coeff1; @@ -104,13 +106,11 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - - zbin = __lsx_vld(zbin_ptr, 0); - round = 
__lsx_vld(round_ptr, 0); - quant = __lsx_vld(quant_ptr, 0); + zbin = __lsx_vld(mb_plane->zbin, 0); + round = __lsx_vld(mb_plane->round, 0); + quant = __lsx_vld(mb_plane->quant, 0); dequant = __lsx_vld(dequant_ptr, 0); - quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); // Handle one DC and first 15 AC. DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); qcoeff0 = __lsx_vabsd_h(coeff0, zero); @@ -167,31 +167,27 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, +void vpx_quantize_b_32x32_lsx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m128i zero = __lsx_vldi(0); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, quant_shift; __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - zbin = __lsx_vld(zbin_ptr, 0); + zbin = __lsx_vld(mb_plane->zbin, 0); zbin = __lsx_vsrari_h(zbin, 1); - round = __lsx_vld(round_ptr, 0); + round = __lsx_vld(mb_plane->round, 0); round = __lsx_vsrari_h(round, 1); - quant = __lsx_vld(quant_ptr, 0); + quant = __lsx_vld(mb_plane->quant, 0); dequant = __lsx_vld(dequant_ptr, 0); - quant_shift = __lsx_vld(quant_shift_ptr, 0); + quant_shift = __lsx_vld(mb_plane->quant_shift, 0); quant_shift = __lsx_vslli_h(quant_shift, 1); // Handle one DC and first 15 AC. 
DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1); diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h index d54ce5368..53462b59f 100644 --- a/vpx_dsp/mips/macros_msa.h +++ b/vpx_dsp/mips/macros_msa.h @@ -774,16 +774,16 @@ Details : 4 signed word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_SW_S32(in) \ - ({ \ - v2i64 res0_m, res1_m; \ - int32_t sum_m; \ - \ - res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ - res1_m = __msa_splati_d(res0_m, 1); \ - res0_m = res0_m + res1_m; \ - sum_m = __msa_copy_s_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_SW_S32(in) \ + ({ \ + v2i64 hadd_sw_s32_res0_m, hadd_sw_s32_res1_m; \ + int32_t hadd_sw_s32_sum_m; \ + \ + hadd_sw_s32_res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in); \ + hadd_sw_s32_res1_m = __msa_splati_d(hadd_sw_s32_res0_m, 1); \ + hadd_sw_s32_res0_m = hadd_sw_s32_res0_m + hadd_sw_s32_res1_m; \ + hadd_sw_s32_sum_m = __msa_copy_s_w((v4i32)hadd_sw_s32_res0_m, 0); \ + hadd_sw_s32_sum_m; \ }) /* Description : Horizontal addition of 4 unsigned word elements @@ -793,16 +793,16 @@ Details : 4 unsigned word elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UW_U32(in) \ - ({ \ - v2u64 res0_m, res1_m; \ - uint32_t sum_m; \ - \ - res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ - res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m += res1_m; \ - sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ - sum_m; \ +#define HADD_UW_U32(in) \ + ({ \ + v2u64 hadd_uw_u32_res0_m, hadd_uw_u32_res1_m; \ + uint32_t hadd_uw_u32_sum_m; \ + \ + hadd_uw_u32_res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ + hadd_uw_u32_res1_m = (v2u64)__msa_splati_d((v2i64)hadd_uw_u32_res0_m, 1); \ + hadd_uw_u32_res0_m += hadd_uw_u32_res1_m; \ + hadd_uw_u32_sum_m = __msa_copy_u_w((v4i32)hadd_uw_u32_res0_m, 0); \ + hadd_uw_u32_sum_m; \ }) /* Description : Horizontal addition of 8 unsigned halfword elements @@ -812,14 +812,14 @@ Details : 8 unsigned halfword elements of 'in' vector are added together and the resulting integer sum is returned */ -#define HADD_UH_U32(in) \ - ({ \ - v4u32 res_m; \ - uint32_t sum_m; \ - \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - sum_m = HADD_UW_U32(res_m); \ - sum_m; \ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 hadd_uh_u32_res_m; \ + uint32_t hadd_uh_u32_sum_m; \ + \ + hadd_uh_u32_res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + hadd_uh_u32_sum_m = HADD_UW_U32(hadd_uh_u32_res_m); \ + hadd_uh_u32_sum_m; \ }) /* Description : Horizontal addition of unsigned byte vector elements diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c index be9614a35..6c6bc9a30 100644 --- a/vpx_dsp/ppc/variance_vsx.c +++ b/vpx_dsp/ppc/variance_vsx.c @@ -225,7 +225,7 @@ static INLINE void variance(const uint8_t *src_ptr, int src_stride, } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. 
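
The relationship this comment refers to, in scalar form: the variance kernels accumulate both the sum and the sum of squares of the pixel differences and return sse - sum^2 / (w * h), while the MSE variants simply skip the correction term. A model of the final step (illustration only; the function name is not from the source):

#include <stdint.h>

/* variance = sse minus the mean-correction term; MSE returns sse as-is. */
static uint32_t variance_from_sums(int64_t sse, int64_t sum, int w, int h) {
  return (uint32_t)(sse - (sum * sum) / (w * h));
}
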
*/ #define MSE(W, H) \ diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c index f0d4e927a..4ee4130a2 100644 --- a/vpx_dsp/psnr.c +++ b/vpx_dsp/psnr.c @@ -45,14 +45,14 @@ static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b, } #if CONFIG_VP9_HIGHBITDEPTH -static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride, - const uint8_t *b8, int b_stride, int w, - int h) { +static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int w, + int h) { int i, j; int64_t sse = 0; - uint16_t *a = CONVERT_TO_SHORTPTR(a8); - uint16_t *b = CONVERT_TO_SHORTPTR(b8); + const uint16_t *a = CONVERT_TO_SHORTPTR(a8); + const uint16_t *b = CONVERT_TO_SHORTPTR(b8); for (i = 0; i < h; i++) { for (j = 0; j < w; j++) { @@ -88,10 +88,8 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b, for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; - unsigned int sse; for (x = 0; x < width / 16; ++x) { - vpx_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; @@ -131,21 +129,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b, const int dw = width % 16; const int dh = height % 16; if (dw > 0) { - total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw], - b_stride, dw, height); + total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw], + b_stride, dw, height); } if (dh > 0) { - total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride, - &b[(height - dh) * b_stride], b_stride, - width - dw, dh); + total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride, + &b[(height - dh) * b_stride], b_stride, + width - dw, dh); } for (y = 0; y < height / 16; ++y) { const uint8_t *pa = a; const uint8_t *pb = b; - unsigned int sse; for (x = 0; x < width / 16; ++x) { - vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse); - total_sse += sse; + total_sse += vpx_highbd_sse(pa, a_stride, pb, b_stride, 16, 16); pa += 16; pb += 16; } diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h index 9ebb64dd5..7c57aa429 100644 --- a/vpx_dsp/psnr.h +++ b/vpx_dsp/psnr.h @@ -26,7 +26,7 @@ typedef struct vpx_psnr_pkt PSNR_STATS; /*!\brief Converts SSE to PSNR * - * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). + * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
* * \param[in] samples Number of samples * \param[in] peak Max sample value diff --git a/vpx_dsp/quantize.c b/vpx_dsp/quantize.c index 5d6ba64a8..fac9136f8 100644 --- a/vpx_dsp/quantize.c +++ b/vpx_dsp/quantize.c @@ -14,6 +14,8 @@ #include "vpx_dsp/quantize.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, const int16_t *round_ptr, const int16_t quant, @@ -114,15 +116,17 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, #endif void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -162,16 +166,17 @@ void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, non_zero_count = (int)n_coeffs, eob = -1; - const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] }; + const int zbins[2] = { mb_plane->zbin[0], mb_plane->zbin[1] }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; - (void)iscan; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -208,21 +213,23 @@ void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } #endif -void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const struct ScanOrder *const scan_order) { + const int n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + 
const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; - int idx_arr[1024]; + int idx_arr[32 * 32 /* n_coeffs */]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); @@ -269,19 +276,21 @@ void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, #if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_32x32_c( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { - const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 1), - ROUND_POWER_OF_TWO(zbin_ptr[1], 1) }; + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { + const intptr_t n_coeffs = 32 * 32; + const int zbins[2] = { ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1), + ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1) }; const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; + const int16_t *scan = scan_order->scan; int idx = 0; int idx_arr[1024]; int i, eob = -1; - (void)iscan; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index b47c43430..2a4c81d58 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -40,9 +40,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, unsigned int vpx_sad##m##x##n##_avg_c( \ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ int ref_stride, const uint8_t *second_pred) { \ - DECLARE_ALIGNED(16, uint8_t, comp_pred[m * n]); \ + DECLARE_ALIGNED(32, uint8_t, comp_pred[m * n]); \ vpx_comp_avg_pred_c(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ return sad(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_sad_skip_##m##x##n##_c( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride, (m), \ + (n / 2)); \ } // Compare |src_ptr| to 4 distinct references in |ref_array[4]| @@ -54,6 +60,15 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, for (i = 0; i < 4; ++i) \ sad_array[i] = \ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_array[i], ref_stride); \ + } \ + void vpx_sad_skip_##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = 2 * sad(src_ptr, 2 * src_stride, ref_array[i], \ + 2 * ref_stride, (m), (n / 2)); \ + } \ } /* clang-format off */ @@ -156,6 +171,12 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, vpx_highbd_comp_avg_pred_c(comp_pred, CONVERT_TO_SHORTPTR(second_pred), m, \ n, CONVERT_TO_SHORTPTR(ref_ptr), ref_stride); \ return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ + } \ + unsigned int vpx_highbd_sad_skip_##m##x##n##_c( 
\ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * \ + highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \ } #define highbd_sadMxNx4D(m, n) \ @@ -167,6 +188,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ ref_array[i], ref_stride); \ } \ + } \ + void vpx_highbd_sad_skip_##m##x##n##x4d_c( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad_skip_##m##x##n##_c( \ + src, src_stride, ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/sse.c b/vpx_dsp/sse.c new file mode 100644 index 000000000..6cb4b705f --- /dev/null +++ b/vpx_dsp/sse.c @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/* + * Sum the square of the difference between every corresponding element of the + * buffers. + */ + +#include <stdlib.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx/vpx_integer.h" + +int64_t vpx_sse_c(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = abs(a[x] - b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +int64_t vpx_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int y, x; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) { + const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]); + sse += diff * diff; + } + + a += a_stride; + b += b_stride; + } + return sse; +} +#endif diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c index ce1e8382b..1c476542f 100644 --- a/vpx_dsp/variance.c +++ b/vpx_dsp/variance.c @@ -156,7 +156,7 @@ static void var_filter_block2d_bil_second_pass( const uint8_t *second_pred) { \ uint16_t fdata3[(H + 1) * W]; \ uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + DECLARE_ALIGNED(32, uint8_t, temp3[H * W]); \ \ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_stride, 1, H + 1, \ W, bilinear_filters[x_offset]); \ @@ -180,7 +180,7 @@ static void var_filter_block2d_bil_second_pass( } /* Identical to the variance call except it does not calculate the - * sse - sum^2 / w*h and returns sse in addtion to modifying the passed in + * sse - sum^2 / w*h and returns sse in addition to modifying the passed in * variable. */ #define MSE(W, H) \ diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h index 755cb907d..ccdb2f90b 100644 --- a/vpx_dsp/variance.h +++ b/vpx_dsp/variance.h @@ -69,11 +69,15 @@ typedef struct variance_vtable { #if CONFIG_VP9 typedef struct vp9_variance_vtable { vpx_sad_fn_t sdf; + // Same as normal sad, but downsample the rows by a factor of 2. 
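
The downsampled SAD behind these function-pointer hooks is the vpx_sad_skip_* family added earlier in this change: both strides are doubled so only every other row is read, and the result is doubled to stay comparable with full-SAD scores. A scalar model (illustration only; the name is not from the source):

#include <stdint.h>
#include <stdlib.h>

/* Model of vpx_sad_skip_WxH_c: h/2 rows sampled at twice the stride,
 * result scaled by 2. */
static unsigned int sad_skip_model(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < h / 2; ++y) {
    for (x = 0; x < w; ++x) sad += (unsigned int)abs(src[x] - ref[x]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sad;
}
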
+ vpx_sad_fn_t sdsf; vpx_sad_avg_fn_t sdaf; vpx_variance_fn_t vf; vpx_subpixvariance_fn_t svf; vpx_subp_avg_variance_fn_t svaf; vpx_sad_multi_d_fn_t sdx4df; + // Same as sadx4, but downsample the rows by a factor of 2. + vpx_sad_multi_d_fn_t sdsx4df; } vp9_variance_fn_ptr_t; #endif // CONFIG_VP9 diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 1fd9495cf..93abf39ff 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -31,10 +31,15 @@ DSP_SRCS-yes += bitwriter_buffer.c DSP_SRCS-yes += bitwriter_buffer.h DSP_SRCS-yes += psnr.c DSP_SRCS-yes += psnr.h +DSP_SRCS-yes += sse.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c DSP_SRCS-$(CONFIG_INTERNAL_STATS) += fastssim.c +DSP_SRCS-$(HAVE_NEON) += arm/sse_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sse_neon_dotprod.c +DSP_SRCS-$(HAVE_SSE4_1) += x86/sse_sse4.c +DSP_SRCS-$(HAVE_AVX2) += x86/sse_avx2.c endif ifeq ($(CONFIG_DECODERS),yes) @@ -133,6 +138,10 @@ DSP_SRCS-yes += arm/vpx_convolve_copy_neon.c DSP_SRCS-yes += arm/vpx_convolve8_neon.c DSP_SRCS-yes += arm/vpx_convolve_avg_neon.c DSP_SRCS-yes += arm/vpx_convolve_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve8_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/vpx_convolve_neon_dotprod.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve8_neon_i8mm.c +DSP_SRCS-$(HAVE_NEON_I8MM) += arm/vpx_convolve_neon_i8mm.c endif # HAVE_NEON endif # HAVE_NEON_ASM @@ -252,6 +261,7 @@ DSP_SRCS-yes += inv_txfm.h DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/inv_txfm_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.h DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c @@ -342,6 +352,10 @@ DSP_SRCS-$(HAVE_SSE2) += x86/avg_intrin_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/avg_intrin_avx2.c DSP_SRCS-$(HAVE_NEON) += arm/avg_neon.c DSP_SRCS-$(HAVE_NEON) += arm/hadamard_neon.c +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_hadamard_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_neon.c +endif DSP_SRCS-$(HAVE_MSA) += mips/avg_msa.c DSP_SRCS-$(HAVE_LSX) += loongarch/avg_lsx.c ifeq ($(VPX_ARCH_X86_64),yes) @@ -364,7 +378,9 @@ DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad4d_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/sad_neon_dotprod.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c DSP_SRCS-$(HAVE_MSA) += mips/sad_msa.c @@ -392,6 +408,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c @@ -406,6 +423,7 @@ DSP_SRCS-yes += variance.h DSP_SRCS-$(HAVE_NEON) += arm/avg_pred_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c +DSP_SRCS-$(HAVE_NEON_DOTPROD) += arm/variance_neon_dotprod.c DSP_SRCS-$(HAVE_MSA) += mips/variance_msa.c DSP_SRCS-$(HAVE_MSA) += mips/sub_pixel_variance_msa.c @@ -418,6 +436,7 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/avg_pred_lsx.c DSP_SRCS-$(HAVE_MMI) += mips/variance_mmi.c 
DSP_SRCS-$(HAVE_SSE2) += x86/avg_pred_sse2.c +DSP_SRCS-$(HAVE_AVX2) += x86/avg_pred_avx2.c DSP_SRCS-$(HAVE_SSE2) += x86/variance_sse2.c # Contains SSE2 and SSSE3 DSP_SRCS-$(HAVE_AVX2) += x86/variance_avx2.c DSP_SRCS-$(HAVE_VSX) += ppc/variance_vsx.c @@ -432,7 +451,10 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_avg_pred_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_sse_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_subpel_variance_neon.c endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 2de449546..4b946d756 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -45,9 +45,21 @@ typedef int16_t tran_low_t; typedef int16_t tran_coef_t; +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code for clip_pixel() when the return type is uint8_t. +// See: +// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +static INLINE int clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} +#else static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } +#endif static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8725821b6..e9d63f6ef 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -17,6 +17,10 @@ print <<EOF #include "vpx/vpx_integer.h" #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_dsp/vpx_filter.h" +#if CONFIG_VP9_ENCODER + struct macroblock_plane; + struct ScanOrder; +#endif EOF } @@ -38,7 +42,7 @@ if ($opts{arch} eq "x86_64") { # add_proto qw/void vpx_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_4x4 sse2/; +specialize qw/vpx_d207_predictor_4x4 neon sse2/; add_proto qw/void vpx_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_4x4 neon sse2/; @@ -46,7 +50,7 @@ specialize qw/vpx_d45_predictor_4x4 neon sse2/; add_proto qw/void vpx_d45e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_4x4 ssse3/; +specialize qw/vpx_d63_predictor_4x4 neon ssse3/; add_proto qw/void vpx_d63e_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; @@ -57,12 +61,13 @@ specialize qw/vpx_h_predictor_4x4 neon dspr2 msa sse2/; add_proto qw/void vpx_he_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_4x4 neon/; add_proto qw/void vpx_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const 
uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_4x4 neon/; add_proto qw/void vpx_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_4x4 ssse3/; +specialize qw/vpx_d153_predictor_4x4 neon ssse3/; add_proto qw/void vpx_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_4x4 neon msa sse2/; @@ -86,7 +91,7 @@ add_proto qw/void vpx_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t stride, co specialize qw/vpx_dc_128_predictor_4x4 msa neon sse2/; add_proto qw/void vpx_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_8x8 ssse3/; +specialize qw/vpx_d207_predictor_8x8 neon ssse3/; add_proto qw/void vpx_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. @@ -94,19 +99,20 @@ specialize qw/vpx_d45_predictor_8x8 neon sse2/; add_proto qw/void vpx_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. -specialize qw/vpx_d63_predictor_8x8 ssse3/; +specialize qw/vpx_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; # TODO(crbug.com/webm/1522): Re-enable vsx implementation. specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_8x8 neon/; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_8x8 neon/; add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_8x8 ssse3/; +specialize qw/vpx_d153_predictor_8x8 neon ssse3/; add_proto qw/void vpx_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_8x8 neon msa sse2/; @@ -129,24 +135,25 @@ add_proto qw/void vpx_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, co specialize qw/vpx_dc_128_predictor_8x8 neon msa sse2/; add_proto qw/void vpx_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_16x16 ssse3/; +specialize qw/vpx_d207_predictor_16x16 neon ssse3/; add_proto qw/void vpx_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_16x16 ssse3 vsx/; +specialize qw/vpx_d63_predictor_16x16 neon ssse3 vsx/; add_proto qw/void vpx_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_16x16 neon/; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t 
*left"; specialize qw/vpx_d135_predictor_16x16 neon/; add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_16x16 ssse3/; +specialize qw/vpx_d153_predictor_16x16 neon ssse3/; add_proto qw/void vpx_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_16x16 neon msa sse2 vsx/; @@ -167,24 +174,25 @@ add_proto qw/void vpx_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, specialize qw/vpx_dc_128_predictor_16x16 neon msa sse2 vsx/; add_proto qw/void vpx_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d207_predictor_32x32 ssse3/; +specialize qw/vpx_d207_predictor_32x32 neon ssse3/; add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d45_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d63_predictor_32x32 ssse3 vsx/; +specialize qw/vpx_d63_predictor_32x32 neon ssse3 vsx/; add_proto qw/void vpx_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_h_predictor_32x32 neon msa sse2 vsx/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d117_predictor_32x32 neon/; add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d135_predictor_32x32 neon/; add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d153_predictor_32x32 ssse3/; +specialize qw/vpx_d153_predictor_32x32 neon ssse3/; add_proto qw/void vpx_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_v_predictor_32x32 neon msa sse2 vsx/; @@ -207,25 +215,25 @@ specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/; # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d207_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_4x4 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d63_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d117_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const 
uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_4x4 sse2/; + specialize qw/vpx_highbd_d153_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; @@ -246,25 +254,25 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d207_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d63_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d117_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_8x8 ssse3/; + specialize qw/vpx_highbd_d153_predictor_8x8 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; @@ -285,25 +293,25 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d207_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d63_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t 
*above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d117_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_16x16 ssse3/; + specialize qw/vpx_highbd_d153_predictor_16x16 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; @@ -324,25 +332,25 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d207_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d63_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d63_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_h_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d117_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d117_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d135_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_d153_predictor_32x32 ssse3/; + specialize qw/vpx_highbd_d153_predictor_32x32 neon ssse3/; add_proto qw/void vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; @@ -374,22 +382,22 @@ add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_horiz sse2 ssse3 
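The add_proto/specialize pairs in these hunks feed libvpx's run-time CPU detection (RTCD): add_proto declares the C prototype for a DSP function, and specialize lists the SIMD flavors that implement it. Configure-time tooling turns each pair into a function pointer that an init routine retargets to the best kernel the host CPU supports. A minimal sketch of that dispatch pattern, with stub kernels and an assumed flag value standing in for the generated header:

#include <stddef.h>
#include <stdint.h>

#define HAS_NEON (1 << 0) /* assumed value; the real flag lives in vpx_ports */

typedef void (*predictor_fn)(uint8_t *dst, ptrdiff_t stride,
                             const uint8_t *above, const uint8_t *left);

/* Stub kernels standing in for the real C and NEON implementations. */
static void d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  (void)dst; (void)stride; (void)above; (void)left;
}
static void d207_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)dst; (void)stride; (void)above; (void)left;
}

/* The generated header exposes a function pointer; callers never branch. */
static predictor_fn vpx_d207_predictor_8x8 = d207_predictor_8x8_c;

static void rtcd_init(int cpu_caps) {
  if (cpu_caps & HAS_NEON) vpx_d207_predictor_8x8 = d207_predictor_8x8_neon;
}

Adding "neon" to a specialize line, as these hunks do for the d63/d117/d153/d207 predictors, is all it takes for the dispatcher to prefer the new kernel on Arm.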
avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/; +specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon neon_dotprod neon_i8mm dspr2 msa vsx mmi lsx/; add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_scaled_2d ssse3 neon msa/; @@ -589,7 +597,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct8x8_1 sse2 neon msa/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct16x16 neon sse2 msa lsx/; + specialize qw/vpx_fdct16x16 neon sse2 avx2 msa lsx/; add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct16x16_1 sse2 neon msa/; @@ -633,12 +641,12 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_idct8x8_64_add neon sse2 vsx/; specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; specialize qw/vpx_idct8x8_1_add neon sse2/; - specialize qw/vpx_idct16x16_256_add neon sse2 vsx/; + specialize qw/vpx_idct16x16_256_add neon sse2 avx2 vsx/; specialize qw/vpx_idct16x16_38_add neon sse2/; specialize qw/vpx_idct16x16_10_add neon sse2/; specialize qw/vpx_idct16x16_1_add neon sse2/; - specialize qw/vpx_idct32x32_1024_add neon sse2 vsx/; - specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; + specialize qw/vpx_idct32x32_1024_add neon sse2 avx2 vsx/; + specialize qw/vpx_idct32x32_135_add neon sse2 ssse3 avx2/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; specialize qw/vpx_iwht4x4_16_add sse2 vsx/; @@ -714,17 +722,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Quantization # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { - add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const 
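neon_dotprod and neon_i8mm are new specialization tiers for the Armv8.2 dot-product (UDOT/SDOT) and Armv8.6 I8MM (USDOT) extensions; unlike baseline NEON they are optional, so the cpudetect rework referenced earlier in this patch probes for them at run time. A sketch of the Linux/AArch64 probe, assuming the kernel exports the standard HWCAP bits (other OSes need different plumbing):

#include <sys/auxv.h>   /* getauxval */
#include <asm/hwcap.h>  /* HWCAP_ASIMDDP, HWCAP2_I8MM (Linux/AArch64 only) */

/* Returns a capability mask in the spirit of vpx_ports' arm CPU caps;
   the bit positions here are illustrative. */
static int arm_cpu_caps_model(void) {
  int caps = 0;
  if (getauxval(AT_HWCAP) & HWCAP_ASIMDDP) caps |= 1 << 1; /* dotprod */
  if (getauxval(AT_HWCAP2) & HWCAP2_I8MM) caps |= 1 << 2;  /* i8mm */
  return caps;
}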
int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/; - add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { - add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b neon sse2 avx2/; - add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const struct ScanOrder *const scan_order"; specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/; } # CONFIG_VP9_HIGHBITDEPTH } # CONFIG_VP9_ENCODER @@ -736,32 +744,35 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") { add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/; +add_proto qw/int64_t/, "vpx_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height"; +specialize qw/vpx_sse sse4_1 avx2 neon neon_dotprod/; + # # Single block SAD # add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const 
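The quantizer prototypes above collapse four per-plane coefficient arrays (zbin, round, quant, quant_shift) into a single macroblock_plane pointer, and the scan/iscan pair into a ScanOrder; the 32x32 variants also drop n_coeffs, which is implied by the block size. A sketch of the shape of the new interface; the struct fields are assumptions modeled on the VP9 encoder structs, not copies of them:

#include <stdint.h>

typedef int32_t tran_low_t; /* int16_t in non-high-bitdepth builds */

/* Assumed layout: the real structs live in the VP9 encoder headers. */
struct macroblock_plane {
  int16_t *zbin, *round, *quant, *quant_shift;
};
struct ScanOrder {
  const int16_t *scan, *iscan;
};

void vpx_quantize_b(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                    const struct macroblock_plane *const mb_plane,
                    tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                    const int16_t *dequant_ptr, uint16_t *eob_ptr,
                    const struct ScanOrder *const scan_order);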
uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x64 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32 neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16 neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32 neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad16x16 neon neon_dotprod msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8 neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; @@ -778,6 +789,45 @@ specialize qw/vpx_sad4x8 neon msa sse2 mmi/; add_proto qw/unsigned int vpx_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x4 neon msa sse2 mmi/; +add_proto qw/unsigned int vpx_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x64 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_64x32 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x64 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x32 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_32x16 neon neon_dotprod avx2 sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x32 neon neon_dotprod sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_16x16 neon neon_dotprod sse2/; + +add_proto qw/unsigned int vpx_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize 
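The neon_dotprod SAD tier added throughout this block rests on one idea: per-byte absolute differences can be summed four-at-a-time into 32-bit lanes with a single UDOT against a vector of ones, replacing the widen-and-pairwise-add chain that plain NEON needs. A sketch of the row kernel (compile with dot-product support, e.g. -march=armv8.2-a+dotprod):

#include <arm_neon.h>
#include <stdint.h>

/* SAD of one 16-byte row: per-byte |s - r|, then UDOT with all-ones
   accumulates groups of four bytes into each 32-bit lane of acc.
   A final vaddvq_u32(acc) reduces the four lanes to the SAD total. */
static inline uint32x4_t sad16_row_dotprod(uint32x4_t acc, const uint8_t *src,
                                           const uint8_t *ref) {
  const uint8x16_t s = vld1q_u8(src);
  const uint8x16_t r = vld1q_u8(ref);
  const uint8x16_t diff = vabdq_u8(s, r);     /* per-byte |s - r| */
  return vdotq_u32(acc, diff, vdupq_n_u8(1)); /* acc += dot(diff, 1) */
}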
qw/vpx_sad_skip_16x8 neon neon_dotprod sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x16 neon sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x8 neon sse2/; + +add_proto qw/unsigned int vpx_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_8x4 neon/; + +add_proto qw/unsigned int vpx_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x8 neon sse2/; + +add_proto qw/unsigned int vpx_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; +specialize qw/vpx_sad_skip_4x4 neon/; + # # Avg # @@ -802,19 +852,19 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_8x8 avx2/; + specialize qw/vpx_highbd_hadamard_8x8 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_16x16 avx2/; + specialize qw/vpx_highbd_hadamard_16x16 avx2 neon/; add_proto qw/void vpx_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff"; - specialize qw/vpx_highbd_hadamard_32x32 avx2/; + specialize qw/vpx_highbd_hadamard_32x32 avx2 neon/; add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; specialize qw/vpx_satd avx2 sse2 neon/; add_proto qw/int vpx_highbd_satd/, "const tran_low_t *coeff, int length"; - specialize qw/vpx_highbd_satd avx2/; + specialize qw/vpx_highbd_satd avx2 neon/; } else { add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff"; specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64"; @@ -830,38 +880,37 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { } add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height"; - specialize qw/vpx_int_pro_row sse2 neon msa/; - + specialize qw/vpx_int_pro_row neon sse2 msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; - specialize qw/vpx_int_pro_col sse2 neon msa/; + specialize qw/vpx_int_pro_col neon sse2 msa/; add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; specialize qw/vpx_vector_var neon sse2 msa/; } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad64x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad64x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx 
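The new vpx_sad_skip_* family is the downsampled SAD used by faster motion-search presets: it reads only every other row, then doubles the result so scores remain comparable with full SADs. A scalar model of the whole family, with width and height as parameters instead of per-size entry points:

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_skip_model(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height / 2; ++r) { /* even rows only */
    for (c = 0; c < width; ++c) sad += (unsigned)abs(src[c] - ref[c]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sad; /* compensate for the skipped rows */
}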
mmi/; +specialize qw/vpx_sad32x64_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/; +specialize qw/vpx_sad32x32_avg neon neon_dotprod avx2 msa sse2 vsx mmi lsx/; add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/; +specialize qw/vpx_sad32x16_avg neon neon_dotprod avx2 msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x32_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x32_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x16_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x16_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; -specialize qw/vpx_sad16x8_avg neon msa sse2 vsx mmi/; +specialize qw/vpx_sad16x8_avg neon neon_dotprod msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_sad8x16_avg neon msa sse2 mmi/; @@ -881,45 +930,84 @@ specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad64x64x4d avx512 avx2 neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad64x32x4d neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad32x64x4d neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t 
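The *_avg variants score a compound prediction: the reference block is first averaged with second_pred, rounding up exactly as vpx_comp_avg_pred does, and the SAD is taken against that average. A scalar model:

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_avg_model(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  const uint8_t *second_pred,
                                  int width, int height) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const int avg = (ref[c] + second_pred[c] + 1) >> 1; /* rounding average */
      sad += (unsigned)abs(src[c] - avg);
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += width; /* second_pred is packed at the block width */
  }
  return sad;
}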
sad_array[4]"; +specialize qw/vpx_sad32x32x4d avx2 neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad32x16x4d neon neon_dotprod msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad16x32x4d neon neon_dotprod msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad16x16x4d neon neon_dotprod msa sse2 vsx mmi lsx/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; -specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad16x8x4d neon neon_dotprod msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize 
qw/vpx_sad4x4x4d neon msa sse2 mmi/; +add_proto qw/void vpx_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x64x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_64x32x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x64x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x32x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_32x16x4d neon neon_dotprod avx2 sse2/; + +add_proto qw/void vpx_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x32x4d neon neon_dotprod sse2/; + +add_proto qw/void vpx_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x16x4d neon neon_dotprod sse2/; + +add_proto qw/void vpx_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_16x8x4d neon neon_dotprod sse2/; + +add_proto qw/void vpx_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x16x4d neon sse2/; + +add_proto qw/void vpx_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x8x4d neon sse2/; + +add_proto qw/void vpx_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_8x4x4d neon/; + +add_proto qw/void vpx_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x8x4d neon sse2/; + +add_proto qw/void vpx_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; +specialize qw/vpx_sad_skip_4x4x4d neon/; + add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; @@ -941,6 +1029,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd"; specialize qw/vpx_highbd_subtract_block neon avx2/; + add_proto qw/int64_t/, "vpx_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; + specialize qw/vpx_highbd_sse sse4_1 avx2 neon/; + # # Single block 
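The x4d variants evaluate four candidate references per call so that SIMD versions can amortize every source load across four accumulators, which is also where the neon_dotprod tier pays off most. A scalar model of the data flow:

#include <stdint.h>
#include <stdlib.h>

static void sad_x4d_model(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t out[4], int width, int height) {
  const uint8_t *r0 = ref[0], *r1 = ref[1], *r2 = ref[2], *r3 = ref[3];
  int r, c;
  out[0] = out[1] = out[2] = out[3] = 0;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const int s = src[c]; /* one source load feeds all four SADs */
      out[0] += (uint32_t)abs(s - r0[c]);
      out[1] += (uint32_t)abs(s - r1[c]);
      out[2] += (uint32_t)abs(s - r2[c]);
      out[3] += (uint32_t)abs(s - r3[c]);
    }
    src += src_stride;
    r0 += ref_stride; r1 += ref_stride; r2 += ref_stride; r3 += ref_stride;
  }
}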
SAD # @@ -983,16 +1074,56 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_highbd_sad4x4 neon/; + add_proto qw/unsigned int vpx_highbd_sad_skip_64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x64 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_64x32 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x64 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x32 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_32x16 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x32 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x16 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_16x8 neon sse2 avx2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x16 neon sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x8 neon sse2/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_8x4 neon/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_4x8 neon/; + + add_proto qw/unsigned int vpx_highbd_sad_skip_4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; + specialize qw/vpx_highbd_sad_skip_4x4 neon/; + # # Avg # add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *s8, int p"; - specialize qw/vpx_highbd_avg_8x8 sse2/; + specialize qw/vpx_highbd_avg_8x8 sse2 neon/; add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *s8, int p"; - specialize qw/vpx_highbd_avg_4x4 sse2/; + specialize qw/vpx_highbd_avg_4x4 sse2 neon/; add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max"; + specialize qw/vpx_highbd_minmax_8x8 neon/; add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/; @@ -1036,45 +1167,84 @@ if 
(vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t 
*src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x8x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x4x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x8x4d sse2 neon/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2 neon/; + add_proto qw/void vpx_highbd_sad_skip_64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x64x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_64x32x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x64x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x32x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_32x16x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x32x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x16x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_16x8x4d neon sse2 avx2/; + + add_proto qw/void vpx_highbd_sad_skip_8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x16x4d neon sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x8x4d/, "const uint8_t *src_ptr, int 
src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x8x4d neon sse2/; + + add_proto qw/void vpx_highbd_sad_skip_8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_8x4x4d neon/; + + add_proto qw/void vpx_highbd_sad_skip_4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x8x4d neon sse2/; + + add_proto qw/void vpx_highbd_sad_skip_4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]"; + specialize qw/vpx_highbd_sad_skip_4x4x4d neon/; + # # Structured Similarity (SSIM) # @@ -1090,73 +1260,73 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq " # Variance # add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance64x64 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance64x32 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x64 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance32x32 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance32x16 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x32 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/; + specialize qw/vpx_variance16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x16 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto 
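All of these vpx_highbd_* prototypes keep uint8_t pointers even though the samples are 16-bit: callers encode the uint16_t buffer address and the kernel decodes it, mirroring the shift-based macros in vpx_dsp/vpx_dsp_common.h. A model of a high-bitdepth SAD built on that convention:

#include <stdint.h>
#include <stdlib.h>

/* Mirrors the libvpx convention for smuggling uint16_t buffers through
   uint8_t prototypes. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))

static unsigned int highbd_sad4x4_model(const uint8_t *src8, int src_stride,
                                        const uint8_t *ref8, int ref_stride) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8); /* decode real pointer */
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) sad += (unsigned)abs(src[c] - ref[c]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}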
qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx lsx/; + specialize qw/vpx_variance8x8 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance8x4 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x8 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance4x8 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance4x4 sse2 neon msa mmi vsx/; + specialize qw/vpx_variance4x4 sse2 neon neon_dotprod msa mmi vsx/; # # Specialty Variance # add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx lsx/; + specialize qw/vpx_get16x16var sse2 avx2 neon neon_dotprod msa vsx lsx/; add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; - specialize qw/vpx_get8x8var sse2 neon msa vsx/; + specialize qw/vpx_get8x8var sse2 neon neon_dotprod msa vsx/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/; + specialize qw/vpx_mse16x16 sse2 avx2 neon neon_dotprod msa mmi vsx lsx/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/; + specialize qw/vpx_mse16x8 sse2 avx2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x16 sse2 msa mmi vsx/; + specialize qw/vpx_mse8x16 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_mse8x8 sse2 msa mmi vsx/; + specialize qw/vpx_mse8x8 sse2 neon neon_dotprod msa mmi vsx/; add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *"; specialize qw/vpx_get_mb_ss sse2 msa vsx/; add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride"; - specialize qw/vpx_get4x4sse_cs neon msa vsx/; + specialize qw/vpx_get4x4sse_cs neon neon_dotprod msa vsx/; add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride"; - specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/; + specialize qw/vpx_comp_avg_pred neon sse2 avx2 vsx lsx/; # # Subpixel Variance diff --git a/vpx_dsp/x86/avg_intrin_avx2.c b/vpx_dsp/x86/avg_intrin_avx2.c index b2e01319d..61e4e73c5 100644 --- a/vpx_dsp/x86/avg_intrin_avx2.c +++ b/vpx_dsp/x86/avg_intrin_avx2.c @@ -218,6 +218,14 @@ void 
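Every variance kernel accumulates both the sum of differences and the sum of squared differences in one pass and returns sse minus sum squared over N; the neon_dotprod versions added here map the squared-difference accumulation onto the dot-product instructions. A scalar model:

#include <stdint.h>

/* One-pass variance: var = SSE - sum^2 / N. */
static unsigned int variance_model(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sq = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c) {
      const int d = src[c] - ref[c];
      sum += d;
      sq += (uint64_t)(d * d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sq;
  return (unsigned int)(sq - (uint64_t)((sum * sum) / (w * h)));
}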
vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, } #endif // CONFIG_VP9_HIGHBITDEPTH +static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero, + __m256i *out_lo, + __m256i *out_hi) { + const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in); + *out_lo = _mm256_unpacklo_epi16(in, sign_bits); + *out_hi = _mm256_unpackhi_epi16(in, sign_bits); +} + static void hadamard_col8x2_avx2(__m256i *in, int iter) { __m256i a0 = in[0]; __m256i a1 = in[1]; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m256i b0, b1, b2, b3; + const __m256i zero = _mm256_setzero_si256(); for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] const int16_t *src_ptr = @@ -414,15 +428,38 @@ void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride, const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512)); const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768)); - __m256i b0 = _mm256_add_epi16(coeff0, coeff1); - __m256i b1 = _mm256_sub_epi16(coeff0, coeff1); - __m256i b2 = _mm256_add_epi16(coeff2, coeff3); - __m256i b3 = _mm256_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm256_srai_epi32(b0_lo, 2); + b1_lo = _mm256_srai_epi32(b1_lo, 2); + b2_lo = _mm256_srai_epi32(b2_lo, 2); + b3_lo = _mm256_srai_epi32(b3_lo, 2); + + b0_hi = _mm256_srai_epi32(b0_hi, 2); + b1_hi = _mm256_srai_epi32(b1_hi, 2); + b2_hi = _mm256_srai_epi32(b2_hi, 2); + b3_hi = _mm256_srai_epi32(b3_hi, 2); - b0 = _mm256_srai_epi16(b0, 2); - b1 = _mm256_srai_epi16(b1, 2); - b2 = _mm256_srai_epi16(b2, 2); - b3 = _mm256_srai_epi16(b3, 2); + b0 = _mm256_packs_epi32(b0_lo, b0_hi); + b1 = _mm256_packs_epi32(b1_lo, b1_hi); + b2 = _mm256_packs_epi32(b2_lo, b2_hi); + b3 = _mm256_packs_epi32(b3_lo, b3_hi); store_tran_low(_mm256_add_epi16(b0, b2), coeff); store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256); diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c index 015c11a1f..4447dfab7 100644 --- a/vpx_dsp/x86/avg_intrin_sse2.c +++ b/vpx_dsp/x86/avg_intrin_sse2.c @@ -15,6 +15,14 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_ports/mem.h" +static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero, + __m128i *out_lo, + __m128i *out_hi) { + const __m128i sign_bits = _mm_cmplt_epi16(in, zero); + *out_lo = _mm_unpacklo_epi16(in, sign_bits); + *out_hi = _mm_unpackhi_epi16(in, sign_bits); +} + void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max) { __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, 
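The avg_intrin changes fix a 16-bit overflow in the 32x32 Hadamard: after the 16x16 stages the coefficients can already sit near the int16_t limits, so forming coeff0 + coeff1 with a 16-bit add could wrap before the >> 2 normalization. The rewritten code sign-extends to 32 bits, adds and shifts there, and saturates back with packs_epi32. The scalar equivalent of one combine step (the +/-32640 bound on the stage inputs is an assumption for illustration):

#include <stdint.h>

static int16_t hadamard32_combine_step(int16_t coeff0, int16_t coeff1) {
  const int32_t b = (int32_t)coeff0 + coeff1; /* widen before adding */
  const int32_t shifted = b >> 2;             /* 32-bit arithmetic shift */
  /* saturating pack back to 16 bits, as _mm256_packs_epi32 does */
  if (shifted > INT16_MAX) return INT16_MAX;
  if (shifted < INT16_MIN) return INT16_MIN;
  return (int16_t)shifted;
}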
negdiff, absdiff0, absdiff; @@ -400,6 +408,12 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, int16_t *t_coeff = coeff; #endif int idx; + __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo, + b3_lo; + __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi, + b3_hi; + __m128i b0, b1, b2, b3; + const __m128i zero = _mm_setzero_si128(); for (idx = 0; idx < 4; ++idx) { const int16_t *src_ptr = src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16; @@ -413,15 +427,38 @@ void vpx_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride, __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512)); __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768)); - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); + // Sign extend 16 bit to 32 bit. + sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi); + sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi); + sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi); + sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi); + + b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo); + b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi); + + b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo); + b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi); + + b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo); + b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi); + + b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo); + b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi); + + b0_lo = _mm_srai_epi32(b0_lo, 2); + b1_lo = _mm_srai_epi32(b1_lo, 2); + b2_lo = _mm_srai_epi32(b2_lo, 2); + b3_lo = _mm_srai_epi32(b3_lo, 2); + + b0_hi = _mm_srai_epi32(b0_hi, 2); + b1_hi = _mm_srai_epi32(b1_hi, 2); + b2_hi = _mm_srai_epi32(b2_hi, 2); + b3_hi = _mm_srai_epi32(b3_hi, 2); - b0 = _mm_srai_epi16(b0, 2); - b1 = _mm_srai_epi16(b1, 2); - b2 = _mm_srai_epi16(b2, 2); - b3 = _mm_srai_epi16(b3, 2); + b0 = _mm_packs_epi32(b0_lo, b0_hi); + b1 = _mm_packs_epi32(b1_lo, b1_hi); + b2 = _mm_packs_epi32(b2_lo, b2_hi); + b3 = _mm_packs_epi32(b3_lo, b3_hi); coeff0 = _mm_add_epi16(b0, b2); coeff1 = _mm_add_epi16(b1, b3); diff --git a/vpx_dsp/x86/avg_pred_avx2.c b/vpx_dsp/x86/avg_pred_avx2.c new file mode 100644 index 000000000..f4357998c --- /dev/null +++ b/vpx_dsp/x86/avg_pred_avx2.c @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <immintrin.h> + +#include "./vpx_dsp_rtcd.h" + +void vpx_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width, + int height, const uint8_t *ref, int ref_stride) { + int row = 0; + // comp_pred and pred must be 32 byte aligned. 
+ assert(((intptr_t)comp_pred % 32) == 0); + assert(((intptr_t)pred % 32) == 0); + + if (width == 8) { + assert(height % 4 == 0); + do { + const __m256i p = _mm256_load_si256((const __m256i *)pred); + const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref); + const __m128i r_1 = + _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride)); + + const __m128i r1 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_0), (const __m64 *)(ref + ref_stride))); + const __m128i r2 = _mm_castps_si128(_mm_loadh_pi( + _mm_castsi128_ps(r_1), (const __m64 *)(ref + 3 * ref_stride))); + + const __m256i ref_0123 = + _mm256_inserti128_si256(_mm256_castsi128_si256(r1), r2, 1); + const __m256i avg = _mm256_avg_epu8(p, ref_0123); + + _mm256_store_si256((__m256i *)comp_pred, avg); + + row += 4; + pred += 32; + comp_pred += 32; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 16) { + assert(height % 4 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i tmp0 = + _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)ref)); + const __m256i ref_0 = _mm256_inserti128_si256( + tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1); + const __m256i tmp1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride))); + const __m256i ref_1 = _mm256_inserti128_si256( + tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 4; + pred += 64; + comp_pred += 64; + ref += 4 * ref_stride; + } while (row < height); + } else if (width == 32) { + assert(height % 2 == 0); + do { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)pred); + const __m256i pred_1 = _mm256_load_si256((const __m256i *)(pred + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)ref); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + ref_stride)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)comp_pred, average_0); + _mm256_store_si256((__m256i *)(comp_pred + 32), average_1); + + row += 2; + pred += 64; + comp_pred += 64; + ref += 2 * ref_stride; + } while (row < height); + } else if (width % 64 == 0) { + do { + int x; + for (x = 0; x < width; x += 64) { + const __m256i pred_0 = _mm256_load_si256((const __m256i *)(pred + x)); + const __m256i pred_1 = + _mm256_load_si256((const __m256i *)(pred + x + 32)); + const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x)); + const __m256i ref_1 = + _mm256_loadu_si256((const __m256i *)(ref + x + 32)); + const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0); + const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1); + _mm256_store_si256((__m256i *)(comp_pred + x), average_0); + _mm256_store_si256((__m256i *)(comp_pred + x + 32), average_1); + } + row++; + pred += width; + comp_pred += width; + ref += ref_stride; + } while (row < height); + } else { + vpx_comp_avg_pred_sse2(comp_pred, pred, width, height, ref, ref_stride); + } +} diff --git a/vpx_dsp/x86/fwd_txfm_avx2.c b/vpx_dsp/x86/fwd_txfm_avx2.c index a2ed420e3..c8f54a49c 100644 --- a/vpx_dsp/x86/fwd_txfm_avx2.c +++ b/vpx_dsp/x86/fwd_txfm_avx2.c @@ -8,9 +8,382 @@ * be found in 
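vpx_comp_avg_pred_avx2 is a pure rounding average; the width-specialized branches exist only so that every store is a full, aligned 32-byte write (pred and comp_pred are packed at the block width, so four 8-wide rows or two 32-byte rows fill whole __m256i registers). A scalar model of the exact arithmetic, which _mm256_avg_epu8 performs 32 bytes at a time:

#include <stdint.h>

static void comp_avg_pred_model(uint8_t *comp_pred, const uint8_t *pred,
                                int width, int height, const uint8_t *ref,
                                int ref_stride) {
  int r, c;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c)
      comp_pred[c] = (uint8_t)((pred[c] + ref[c] + 1) >> 1); /* round up */
    comp_pred += width; /* packed at block width */
    pred += width;
    ref += ref_stride;
  }
}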
the AUTHORS file in the root of the source tree. */ +#include <immintrin.h> // AVX2 #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" +#define ADD256_EPI16 _mm256_add_epi16 +#define SUB256_EPI16 _mm256_sub_epi16 + +static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in, + int stride, __m256i *out, + int out_size, int pass) { + int i; + const __m256i kOne = _mm256_set1_epi16(1); + if (pass == 0) { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * stride)); + // x = x << 2 + out[i] = _mm256_slli_epi16(out[i], 2); + } + } else { + for (i = 0; i < out_size; i++) { + out[i] = _mm256_loadu_si256((const __m256i *)(in + i * 16)); + // x = (x + 1) >> 2 + out[i] = _mm256_add_epi16(out[i], kOne); + out[i] = _mm256_srai_epi16(out[i], 2); + } + } +} + +static INLINE void transpose2_8x8_avx2(const __m256i *const in, + __m256i *const out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in, + __m256i *const out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1); + +#define LOADR(idx) \ + t[8 + idx] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + idx] = _mm256_inserti128_si256( \ + t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +// Store 8 16-bit values. Sign extend the values. 
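+// (Illustrative scalar model of the helper below, not part of the patch,
+// assuming the 16-bit tran_low_t build: each iteration stores one full
+// 256-bit row, i.e. 16 lanes, then advances the destination by `stride`
+// elements:
+//
+//   for (i = 0; i < out_size; ++i) {
+//     for (j = 0; j < 16; ++j) out[j] = (tran_low_t)lane16(in[i], j);
+//     out += stride;
+//   }
+//
+// where lane16() is a hypothetical accessor for lane j of a 256-bit
+// register; the cast is where sign extension would occur for a wider
+// tran_low_t.)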
+static INLINE void store_buffer_16bit_to_32bit_w16_avx2(const __m256i *const in, + tran_low_t *out, + const int stride, + const int out_size) { + int i; + for (i = 0; i < out_size; ++i) { + _mm256_storeu_si256((__m256i *)(out), in[i]); + out += stride; + } +} + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE __m256i mult256_round_shift(const __m256i *pin0, + const __m256i *pin1, + const __m256i *pmultiplier, + const __m256i *prounding, + const int shift) { + const __m256i u0 = _mm256_madd_epi16(*pin0, *pmultiplier); + const __m256i u1 = _mm256_madd_epi16(*pin1, *pmultiplier); + const __m256i v0 = _mm256_add_epi32(u0, *prounding); + const __m256i v1 = _mm256_add_epi32(u1, *prounding); + const __m256i w0 = _mm256_srai_epi32(v0, shift); + const __m256i w1 = _mm256_srai_epi32(v1, shift); + return _mm256_packs_epi32(w0, w1); +} + +static INLINE void fdct16x16_1D_avx2(__m256i *input, __m256i *output) { + int i; + __m256i step2[4]; + __m256i in[8]; + __m256i step1[8]; + __m256i step3[8]; + + const __m256i k__cospi_p16_p16 = _mm256_set1_epi16(cospi_16_64); + const __m256i k__cospi_p16_m16 = PAIR256_SET_EPI16(cospi_16_64, -cospi_16_64); + const __m256i k__cospi_p24_p08 = PAIR256_SET_EPI16(cospi_24_64, cospi_8_64); + const __m256i k__cospi_p08_m24 = PAIR256_SET_EPI16(cospi_8_64, -cospi_24_64); + const __m256i k__cospi_m08_p24 = PAIR256_SET_EPI16(-cospi_8_64, cospi_24_64); + const __m256i k__cospi_p28_p04 = PAIR256_SET_EPI16(cospi_28_64, cospi_4_64); + const __m256i k__cospi_m04_p28 = PAIR256_SET_EPI16(-cospi_4_64, cospi_28_64); + const __m256i k__cospi_p12_p20 = PAIR256_SET_EPI16(cospi_12_64, cospi_20_64); + const __m256i k__cospi_m20_p12 = PAIR256_SET_EPI16(-cospi_20_64, cospi_12_64); + const __m256i k__cospi_p30_p02 = PAIR256_SET_EPI16(cospi_30_64, cospi_2_64); + const __m256i k__cospi_p14_p18 = PAIR256_SET_EPI16(cospi_14_64, cospi_18_64); + const __m256i k__cospi_m02_p30 = PAIR256_SET_EPI16(-cospi_2_64, cospi_30_64); + const __m256i k__cospi_m18_p14 = PAIR256_SET_EPI16(-cospi_18_64, cospi_14_64); + const __m256i k__cospi_p22_p10 = PAIR256_SET_EPI16(cospi_22_64, cospi_10_64); + const __m256i k__cospi_p06_p26 = PAIR256_SET_EPI16(cospi_6_64, cospi_26_64); + const __m256i k__cospi_m10_p22 = PAIR256_SET_EPI16(-cospi_10_64, cospi_22_64); + const __m256i k__cospi_m26_p06 = PAIR256_SET_EPI16(-cospi_26_64, cospi_6_64); + const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING); + + // Calculate input for the first 8 results. + for (i = 0; i < 8; i++) { + in[i] = ADD256_EPI16(input[i], input[15 - i]); + } + + // Calculate input for the next 8 results. 
+ for (i = 0; i < 8; i++) { + step1[i] = SUB256_EPI16(input[7 - i], input[8 + i]); + } + + // Work on the first eight values; fdct8(input, even_results); + { + // Add/subtract + const __m256i q0 = ADD256_EPI16(in[0], in[7]); + const __m256i q1 = ADD256_EPI16(in[1], in[6]); + const __m256i q2 = ADD256_EPI16(in[2], in[5]); + const __m256i q3 = ADD256_EPI16(in[3], in[4]); + const __m256i q4 = SUB256_EPI16(in[3], in[4]); + const __m256i q5 = SUB256_EPI16(in[2], in[5]); + const __m256i q6 = SUB256_EPI16(in[1], in[6]); + const __m256i q7 = SUB256_EPI16(in[0], in[7]); + + // Work on first four results + { + // Add/subtract + const __m256i r0 = ADD256_EPI16(q0, q3); + const __m256i r1 = ADD256_EPI16(q1, q2); + const __m256i r2 = SUB256_EPI16(q1, q2); + const __m256i r3 = SUB256_EPI16(q0, q3); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. + { + const __m256i t0 = _mm256_unpacklo_epi16(r0, r1); + const __m256i t1 = _mm256_unpackhi_epi16(r0, r1); + const __m256i t2 = _mm256_unpacklo_epi16(r2, r3); + const __m256i t3 = _mm256_unpackhi_epi16(r2, r3); + + output[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[8] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[4] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[12] = + mult256_round_shift(&t2, &t3, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. + const __m256i d0 = _mm256_unpacklo_epi16(q6, q5); + const __m256i d1 = _mm256_unpackhi_epi16(q6, q5); + const __m256i r0 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_m16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + const __m256i r1 = mult256_round_shift( + &d0, &d1, &k__cospi_p16_p16, &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + + { + // Add/subtract + const __m256i x0 = ADD256_EPI16(q4, r0); + const __m256i x1 = SUB256_EPI16(q4, r0); + const __m256i x2 = SUB256_EPI16(q7, r1); + const __m256i x3 = ADD256_EPI16(q7, r1); + + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ { + const __m256i t0 = _mm256_unpacklo_epi16(x0, x3); + const __m256i t1 = _mm256_unpackhi_epi16(x0, x3); + const __m256i t2 = _mm256_unpacklo_epi16(x1, x2); + const __m256i t3 = _mm256_unpackhi_epi16(x1, x2); + output[2] = + mult256_round_shift(&t0, &t1, &k__cospi_p28_p04, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[14] = + mult256_round_shift(&t0, &t1, &k__cospi_m04_p28, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[10] = + mult256_round_shift(&t2, &t3, &k__cospi_p12_p20, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[6] = + mult256_round_shift(&t2, &t3, &k__cospi_m20_p12, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } + } + } + // Work on the next eight values; step1 -> odd_results + { // step 2 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[5], step1[2]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[5], step1[2]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[4], step1[3]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[4], step1[3]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p16_m16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p16_p16, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 3 + { + step3[0] = ADD256_EPI16(step1[0], step2[1]); + step3[1] = ADD256_EPI16(step1[1], step2[0]); + step3[2] = SUB256_EPI16(step1[1], step2[0]); + step3[3] = SUB256_EPI16(step1[0], step2[1]); + step3[4] = SUB256_EPI16(step1[7], step2[3]); + step3[5] = SUB256_EPI16(step1[6], step2[2]); + step3[6] = ADD256_EPI16(step1[6], step2[2]); + step3[7] = ADD256_EPI16(step1[7], step2[3]); + } + // step 4 + { + const __m256i t0 = _mm256_unpacklo_epi16(step3[1], step3[6]); + const __m256i t1 = _mm256_unpackhi_epi16(step3[1], step3[6]); + const __m256i t2 = _mm256_unpacklo_epi16(step3[2], step3[5]); + const __m256i t3 = _mm256_unpackhi_epi16(step3[2], step3[5]); + step2[0] = mult256_round_shift(&t0, &t1, &k__cospi_m08_p24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[1] = mult256_round_shift(&t2, &t3, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[2] = mult256_round_shift(&t0, &t1, &k__cospi_p24_p08, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + step2[3] = mult256_round_shift(&t2, &t3, &k__cospi_p08_m24, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + // step 5 + { + step1[0] = ADD256_EPI16(step3[0], step2[0]); + step1[1] = SUB256_EPI16(step3[0], step2[0]); + step1[2] = ADD256_EPI16(step3[3], step2[1]); + step1[3] = SUB256_EPI16(step3[3], step2[1]); + step1[4] = SUB256_EPI16(step3[4], step2[3]); + step1[5] = ADD256_EPI16(step3[4], step2[3]); + step1[6] = SUB256_EPI16(step3[7], step2[2]); + step1[7] = ADD256_EPI16(step3[7], step2[2]); + } + // step 6 + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[0], step1[7]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[0], step1[7]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[1], step1[6]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[1], step1[6]); + output[1] = mult256_round_shift(&t0, &t1, &k__cospi_p30_p02, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[9] = mult256_round_shift(&t2, &t3, &k__cospi_p14_p18, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[15] = mult256_round_shift(&t0, &t1, &k__cospi_m02_p30, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[7] = mult256_round_shift(&t2, &t3, 
&k__cospi_m18_p14, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + { + const __m256i t0 = _mm256_unpacklo_epi16(step1[2], step1[5]); + const __m256i t1 = _mm256_unpackhi_epi16(step1[2], step1[5]); + const __m256i t2 = _mm256_unpacklo_epi16(step1[3], step1[4]); + const __m256i t3 = _mm256_unpackhi_epi16(step1[3], step1[4]); + output[5] = mult256_round_shift(&t0, &t1, &k__cospi_p22_p10, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[13] = mult256_round_shift(&t2, &t3, &k__cospi_p06_p26, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[11] = mult256_round_shift(&t0, &t1, &k__cospi_m10_p22, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + output[3] = mult256_round_shift(&t2, &t3, &k__cospi_m26_p06, + &k__DCT_CONST_ROUNDING, DCT_CONST_BITS); + } + } +} + +void vpx_fdct16x16_avx2(const int16_t *input, tran_low_t *output, int stride) { + int pass; + DECLARE_ALIGNED(32, int16_t, intermediate[256]); + int16_t *out0 = intermediate; + tran_low_t *out1 = output; + const int width = 16; + const int height = 16; + __m256i buf0[16], buf1[16]; + + // Two transform and transpose passes + // Process 16 columns (transposed rows in second pass) at a time. + for (pass = 0; pass < 2; ++pass) { + // Load and pre-condition input. + load_buffer_16bit_to_16bit_avx2(input, stride, buf1, height, pass); + + // Calculate dct for 16x16 values + fdct16x16_1D_avx2(buf1, buf0); + + // Transpose the results. + transpose_16bit_16x16_avx2(buf0, buf1); + + if (pass == 0) { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out0, width, height); + } else { + store_buffer_16bit_to_32bit_w16_avx2(buf1, out1, width, height); + } + // Setup in/out for next pass. + input = intermediate; + } +} + #if !CONFIG_VP9_HIGHBITDEPTH #define FDCT32x32_2D_AVX2 vpx_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c index 8edddd637..35ca55404 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_avx2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_avx2.c @@ -11,6 +11,8 @@ #include <immintrin.h> #include "./vpx_dsp_rtcd.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) { const __m128i sign = _mm_srai_epi16(*p, 15); @@ -26,17 +28,15 @@ static VPX_FORCE_INLINE void update_qp(__m256i *qp) { } } -static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *dequant_ptr, - const int16_t *quant_shift_ptr, - __m256i *qp, int log_scale) { - const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr); - const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr); - const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr); +static VPX_FORCE_INLINE void init_qp( + const struct macroblock_plane *const mb_plane, const int16_t *dequant_ptr, + __m256i *qp, int log_scale) { + const __m128i zbin = _mm_loadu_si128((const __m128i *)mb_plane->zbin); + const __m128i round = _mm_loadu_si128((const __m128i *)mb_plane->round); + const __m128i quant = _mm_loadu_si128((const __m128i *)mb_plane->quant); const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr); - const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr); + const __m128i quant_shift = + _mm_loadu_si128((const __m128i *)mb_plane->quant_shift); init_one_qp(&zbin, &qp[0]); init_one_qp(&round, &qp[1]); init_one_qp(&quant, &qp[2]); @@ -134,19 +134,16 @@ static VPX_FORCE_INLINE void quantize(const __m256i *qp, 
} void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const int step = 8; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; + const int16_t *iscan = scan_order->iscan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0); + init_qp(mb_plane, dequant_ptr, qp, 0); quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); @@ -222,17 +219,16 @@ static VPX_FORCE_INLINE void quantize_b_32x32( } void vpx_highbd_quantize_b_32x32_avx2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { const unsigned int step = 8; + intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; __m256i eob = _mm256_setzero_si256(); __m256i qp[5]; - (void)scan; - init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1); + init_qp(mb_plane, dequant_ptr, qp, 1); quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob); diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index ae1981a83..adae60756 100644 --- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -15,19 +15,22 @@ #include "vpx_dsp/vpx_dsp_common.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" -#if CONFIG_VP9_HIGHBITDEPTH void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { int i, j, non_zero_regs = (int)count / 4, eob_i = 0; __m128i zbins[2]; __m128i nzbins[2]; + const int16_t *iscan = scan_order->iscan; + const int16_t *zbin_ptr = mb_plane->zbin; + const int16_t *round_ptr = mb_plane->round; + const int16_t *quant_ptr = mb_plane->quant; + const int16_t *quant_shift_ptr = mb_plane->quant_shift; zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[0]); @@ -38,8 +41,6 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]); nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]); - (void)scan; - memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); @@ -93,19 +94,18 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, } void vpx_highbd_quantize_b_32x32_sse2( - const tran_low_t *coeff_ptr, intptr_t n_coeffs, const 
int16_t *zbin_ptr, - const int16_t *round_ptr, const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const tran_low_t *coeff_ptr, const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const struct ScanOrder *const scan_order) { __m128i zbins[2]; __m128i nzbins[2]; int idx = 0; int idx_arr[1024]; int i, eob = 0; - const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1); - const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1); - (void)scan; + const intptr_t n_coeffs = 32 * 32; + const int16_t *iscan = scan_order->iscan; + const int zbin0_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[0], 1); + const int zbin1_tmp = ROUND_POWER_OF_TWO(mb_plane->zbin[1], 1); zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp); zbins[1] = _mm_set1_epi32(zbin1_tmp); @@ -140,14 +140,14 @@ void vpx_highbd_quantize_b_32x32_sse2( const int coeff = coeff_ptr[rc]; const int coeff_sign = (coeff >> 31); const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; - const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1); - const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1; + const int64_t tmp1 = + abs_coeff + ROUND_POWER_OF_TWO(mb_plane->round[rc != 0], 1); + const int64_t tmp2 = ((tmp1 * mb_plane->quant[rc != 0]) >> 16) + tmp1; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15); + (uint32_t)((tmp2 * mb_plane->quant_shift[rc != 0]) >> 15); qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign; dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? 
iscan[idx_arr[i]] : eob; } *eob_ptr = eob; } -#endif diff --git a/vpx_dsp/x86/highbd_sad4d_avx2.c b/vpx_dsp/x86/highbd_sad4d_avx2.c index 947b5e977..e483fdce7 100644 --- a/vpx_dsp/x86/highbd_sad4d_avx2.c +++ b/vpx_dsp/x86/highbd_sad4d_avx2.c @@ -61,70 +61,79 @@ static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad64xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 2); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 1; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD64XNX4D(n) \ - void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 2); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - 
sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 1; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad64xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 64x64 -HIGHBD_SAD64XNX4D(64) - -// 64x32 -HIGHBD_SAD64XNX4D(32) +#define HIGHBD_SADSKIP64XNx4D(n) \ + void vpx_highbd_sad_skip_64x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad64xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -171,73 +180,79 @@ static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/, } } +static VPX_FORCE_INLINE void highbd_sad32xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *refs[4]; + __m256i sums_16[4]; + __m256i sums_32[4]; + int i; + + refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); + refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); + refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); + refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); + sums_32[0] = _mm256_setzero_si256(); + sums_32[1] = _mm256_setzero_si256(); + sums_32[2] = _mm256_setzero_si256(); + sums_32[3] = _mm256_setzero_si256(); + + for (i = 0; i < (n / 8); ++i) { + sums_16[0] = _mm256_setzero_si256(); + sums_16[1] = _mm256_setzero_si256(); + sums_16[2] = _mm256_setzero_si256(); + sums_16[3] = _mm256_setzero_si256(); + + highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); + + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32[0] = _mm256_add_epi32( + sums_32[0], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)))); + sums_32[1] = _mm256_add_epi32( + sums_32[1], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)))); + sums_32[2] = _mm256_add_epi32( + sums_32[2], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)))); + sums_32[3] = _mm256_add_epi32( + sums_32[3], + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)))); + + src += src_stride << 3; + } + calc_final_4(sums_32, sad_array); +} + #define HIGHBD_SAD32XNX4D(n) \ - void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \ + void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src, int src_stride, \ const uint8_t *const ref_array[4], \ int ref_stride, 
uint32_t sad_array[4]) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *refs[4]; \ - __m256i sums_16[4]; \ - __m256i sums_32[4]; \ - int i; \ - \ - refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \ - refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \ - refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \ - refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \ - sums_32[0] = _mm256_setzero_si256(); \ - sums_32[1] = _mm256_setzero_si256(); \ - sums_32[2] = _mm256_setzero_si256(); \ - sums_32[3] = _mm256_setzero_si256(); \ - \ - for (i = 0; i < (n / 8); ++i) { \ - sums_16[0] = _mm256_setzero_si256(); \ - sums_16[1] = _mm256_setzero_si256(); \ - sums_16[2] = _mm256_setzero_si256(); \ - sums_16[3] = _mm256_setzero_si256(); \ - \ - highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32[0] = _mm256_add_epi32( \ - sums_32[0], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[0], 1)))); \ - sums_32[1] = _mm256_add_epi32( \ - sums_32[1], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[1], 1)))); \ - sums_32[2] = _mm256_add_epi32( \ - sums_32[2], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[2], 1)))); \ - sums_32[3] = _mm256_add_epi32( \ - sums_32[3], \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \ - _mm256_cvtepu16_epi32( \ - _mm256_extractf128_si256(sums_16[3], 1)))); \ - \ - src += src_stride << 3; \ - } \ - calc_final_4(sums_32, sad_array); \ + highbd_sad32xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ } -// 32x64 -HIGHBD_SAD32XNX4D(64) - -// 32x32 -HIGHBD_SAD32XNX4D(32) - -// 32x16 -HIGHBD_SAD32XNX4D(16) +#define HIGHBD_SADSKIP32XNx4D(n) \ + void vpx_highbd_sad_skip_32x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad32xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, const uint16_t *src, @@ -275,13 +290,15 @@ static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/, } } -void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], - int ref_stride, uint32_t sad_array[4]) { +static VPX_FORCE_INLINE void highbd_sad16xNx4d_avx2( + const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], + int ref_stride, uint32_t sad_array[4], int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *refs[4]; __m256i sums_16[4]; __m256i sums_32[4]; + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); @@ -293,13 +310,13 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums_32[2] = _mm256_setzero_si256(); sums_32[3] = _mm256_setzero_si256(); - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { sums_16[0] = _mm256_setzero_si256(); sums_16[1] = _mm256_setzero_si256(); sums_16[2] = _mm256_setzero_si256(); sums_16[3] = _mm256_setzero_si256(); - 
highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16); + highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32[0] = _mm256_add_epi32( @@ -328,6 +345,26 @@ void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } +#define HIGHBD_SAD16XNX4D(n) \ + void vpx_highbd_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, src_stride, ref_array, ref_stride, sad_array, \ + n); \ + } + +#define HIGHBD_SADSKIP16XNx4D(n) \ + void vpx_highbd_sad_skip_16x##n##x4d_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + highbd_sad16xNx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + sad_array, n / 2); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) { @@ -399,3 +436,27 @@ void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums_32, sad_array); } } + +// clang-format off +HIGHBD_SAD64XNX4D(64) +HIGHBD_SADSKIP64XNx4D(64) + +HIGHBD_SAD64XNX4D(32) +HIGHBD_SADSKIP64XNx4D(32) + +HIGHBD_SAD32XNX4D(64) +HIGHBD_SADSKIP32XNx4D(64) + +HIGHBD_SAD32XNX4D(32) +HIGHBD_SADSKIP32XNx4D(32) + +HIGHBD_SAD32XNX4D(16) +HIGHBD_SADSKIP32XNx4D(16) + +HIGHBD_SAD16XNX4D(32) +HIGHBD_SADSKIP16XNx4D(32) + +HIGHBD_SADSKIP16XNx4D(16) + +HIGHBD_SADSKIP16XNx4D(8) + // clang-format on diff --git a/vpx_dsp/x86/highbd_sad4d_sse2.asm b/vpx_dsp/x86/highbd_sad4d_sse2.asm index 6c2a61e01..a07892d81 100644 --- a/vpx_dsp/x86/highbd_sad4d_sse2.asm +++ b/vpx_dsp/x86/highbd_sad4d_sse2.asm @@ -213,7 +213,12 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro HIGH_SADNXN4D 2 +; Macro Arguments: +; 1: Width +; 2: Height +; 3: If 0, then normal sad, if 2, then skip every other row +%macro HIGH_SADNXN4D 2-3 0 +%if %3 == 0 ; normal sad %if UNIX64 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -221,6 +226,15 @@ cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%else ; %3 == 2, downsample +%if UNIX64 +cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif ; +%endif ; sad/avg/skip ; set m1 push srcq @@ -229,6 +243,10 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ pshufd m1, m1, 0x0 pop srcq +%if %3 == 2 ; skip rows + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif ; skip rows movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -244,9 +262,15 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ shl ref1q, 1 HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 2 ; Downsampling by two +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep HIGH_PROCESS_%1x2x4 0, 0, 0, 
src_strideq, ref_strideq, 1 %endrep +%undef rep HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 ; N.B. HIGH_PROCESS outputs dwords (32 bits) ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM @@ -265,6 +289,9 @@ cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ paddd m4, m0 paddd m6, m1 punpcklqdq m4, m6 +%if %3 == 2 ; skip rows + pslld m4, 1 +%endif movifnidn r4, r4mp movu [r4], m4 RET @@ -285,3 +312,15 @@ HIGH_SADNXN4D 8, 8 HIGH_SADNXN4D 8, 4 HIGH_SADNXN4D 4, 8 HIGH_SADNXN4D 4, 4 + +HIGH_SADNXN4D 64, 64, 2 +HIGH_SADNXN4D 64, 32, 2 +HIGH_SADNXN4D 32, 64, 2 +HIGH_SADNXN4D 32, 32, 2 +HIGH_SADNXN4D 32, 16, 2 +HIGH_SADNXN4D 16, 32, 2 +HIGH_SADNXN4D 16, 16, 2 +HIGH_SADNXN4D 16, 8, 2 +HIGH_SADNXN4D 8, 16, 2 +HIGH_SADNXN4D 8, 8, 2 +HIGH_SADNXN4D 4, 8, 2 diff --git a/vpx_dsp/x86/highbd_sad_avx2.c b/vpx_dsp/x86/highbd_sad_avx2.c index 231b67f80..78f8eb8bf 100644 --- a/vpx_dsp/x86/highbd_sad_avx2.c +++ b/vpx_dsp/x86/highbd_sad_avx2.c @@ -50,39 +50,49 @@ static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16, } } -#define HIGHBD_SAD64XN(n) \ - unsigned int vpx_highbd_sad64x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 2); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \ - \ - /* sums_16 will outrange after 2 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 1; \ - ref += ref_stride << 1; \ - } \ - return calc_final(sums_32); \ +static VPX_FORCE_INLINE unsigned int highbd_sad64xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; + + for (i = 0; i < (n / 2); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); + + highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); + + /* sums_16 will outrange after 2 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 1; + ref += ref_stride << 1; } + return calc_final(sums_32); +} -// 64x64 -HIGHBD_SAD64XN(64) +#define HIGHBD_SAD64XN(n) \ + unsigned int vpx_highbd_sad64x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad64xN_avx2(src, src_stride, ref, ref_stride, n); \ + } -// 64x32 -HIGHBD_SAD64XN(32) +#define HIGHBD_SADSKIP64xN(n) \ + unsigned int vpx_highbd_sad_skip_64x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad64xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -107,42 +117,49 @@ static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16, } } -#define HIGHBD_SAD32XN(n) \ - unsigned int 
vpx_highbd_sad32x##n##_avx2( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride) { \ - const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - __m256i sums_32 = _mm256_setzero_si256(); \ - int i; \ - \ - for (i = 0; i < (n / 8); ++i) { \ - __m256i sums_16 = _mm256_setzero_si256(); \ - \ - highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \ - \ - /* sums_16 will outrange after 8 rows, so add current sums_16 to \ - * sums_32*/ \ - sums_32 = _mm256_add_epi32( \ - sums_32, \ - _mm256_add_epi32( \ - _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \ - _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \ - \ - src += src_stride << 3; \ - ref += ref_stride << 3; \ - } \ - return calc_final(sums_32); \ - } +static VPX_FORCE_INLINE unsigned int highbd_sad32xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); + __m256i sums_32 = _mm256_setzero_si256(); + int i; -// 32x64 -HIGHBD_SAD32XN(64) + for (i = 0; i < (n / 8); ++i) { + __m256i sums_16 = _mm256_setzero_si256(); -// 32x32 -HIGHBD_SAD32XN(32) + highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); -// 32x16 -HIGHBD_SAD32XN(16) + /* sums_16 will outrange after 8 rows, so add current sums_16 to + * sums_32*/ + sums_32 = _mm256_add_epi32( + sums_32, + _mm256_add_epi32( + _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), + _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); + + src += src_stride << 3; + ref += ref_stride << 3; + } + return calc_final(sums_32); +} + +#define HIGHBD_SAD32XN(n) \ + unsigned int vpx_highbd_sad32x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return highbd_sad32xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP32xN(n) \ + unsigned int vpx_highbd_sad_skip_32x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad32xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, const uint16_t *src, int src_stride, @@ -167,17 +184,22 @@ static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16, } } -unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_ptr, int ref_stride) { +static VPX_FORCE_INLINE unsigned int highbd_sad16xN_avx2(const uint8_t *src_ptr, + int src_stride, + const uint8_t *ref_ptr, + int ref_stride, + int n) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); __m256i sums_32 = _mm256_setzero_si256(); + const int height = VPXMIN(16, n); + const int num_iters = n / height; int i; - for (i = 0; i < 2; ++i) { + for (i = 0; i < num_iters; ++i) { __m256i sums_16 = _mm256_setzero_si256(); - highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16); + highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, height); // sums_16 will outrange after 16 rows, so add current sums_16 to sums_32 sums_32 = _mm256_add_epi32( @@ -192,6 +214,21 @@ unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride, return calc_final(sums_32); } +#define HIGHBD_SAD16XN(n) \ + unsigned int vpx_highbd_sad16x##n##_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *ref, \ + int ref_stride) { \ + return 
highbd_sad16xN_avx2(src, src_stride, ref, ref_stride, n); \ + } + +#define HIGHBD_SADSKIP16xN(n) \ + unsigned int vpx_highbd_sad_skip_16x##n##_avx2( \ + const uint8_t *src, int src_stride, const uint8_t *ref, \ + int ref_stride) { \ + return 2 * highbd_sad16xN_avx2(src, 2 * src_stride, ref, 2 * ref_stride, \ + n / 2); \ + } + unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride) { const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); @@ -224,6 +261,23 @@ unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride, } } +// clang-format off +HIGHBD_SAD64XN(64) +HIGHBD_SADSKIP64xN(64) +HIGHBD_SAD64XN(32) +HIGHBD_SADSKIP64xN(32) +HIGHBD_SAD32XN(64) +HIGHBD_SADSKIP32xN(64) +HIGHBD_SAD32XN(32) +HIGHBD_SADSKIP32xN(32) +HIGHBD_SAD32XN(16) +HIGHBD_SADSKIP32xN(16) +HIGHBD_SAD16XN(32) +HIGHBD_SADSKIP16xN(32) +HIGHBD_SADSKIP16xN(16) +HIGHBD_SADSKIP16xN(8) +//clang-format on + // AVG ------------------------------------------------------------------------- static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16, const uint16_t *src, diff --git a/vpx_dsp/x86/highbd_sad_sse2.asm b/vpx_dsp/x86/highbd_sad_sse2.asm index 6a1a6f3d6..62ad2237f 100644 --- a/vpx_dsp/x86/highbd_sad_sse2.asm +++ b/vpx_dsp/x86/highbd_sad_sse2.asm @@ -12,6 +12,11 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro HIGH_SAD_FN 4 %if %4 == 0 %if %3 == 5 @@ -20,7 +25,7 @@ cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg +%elif %4 == 1 ; avg %if %3 == 5 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +40,18 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%else ; %4 == 2, skip rows +%if %3 == 5 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal highbd_sad_skip_%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 +%endif ; sad/avg/skip +%if %4 == 2 ; double the stride if we are skipping rows + lea src_strided, [src_strided*2] + lea ref_strided, [ref_strided*2] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -54,7 +70,11 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro HIGH_SAD64XN 1-2 0 HIGH_SAD_FN 64, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -146,6 +166,9 @@ cglobal highbd_sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 7, src, src_stride, \ punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -155,13 +178,19 @@ HIGH_SAD64XN 64 ; highbd_sad64x64_sse2 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 +HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2 +HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2 ; unsigned int vpx_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride, ; 
uint8_t *ref, int ref_stride); %macro HIGH_SAD32XN 1-2 0 HIGH_SAD_FN 32, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 pxor m6, m6 @@ -213,6 +242,9 @@ HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -224,12 +256,19 @@ HIGH_SAD32XN 16 ; highbd_sad32x16_sse2 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 +HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2 +HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2 +HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2 ; unsigned int vpx_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD16XN 1-2 0 HIGH_SAD_FN 16, %1, 5, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 pxor m6, m6 @@ -281,6 +320,9 @@ HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -292,13 +334,19 @@ HIGH_SAD16XN 8 ; highbd_sad16x8_sse2 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2 HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 - +HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2 +HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2 +HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2 ; unsigned int vpx_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro HIGH_SAD8XN 1-2 0 HIGH_SAD_FN 8, %1, 7, %2 +%if %2 == 2 ; skip rows, so divide number of rows by 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 pxor m6, m6 @@ -350,6 +398,9 @@ HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2 punpckldq m0, m6 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -361,3 +412,5 @@ HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 +HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2 +HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2 diff --git a/vpx_dsp/x86/inv_txfm_avx2.c b/vpx_dsp/x86/inv_txfm_avx2.c new file mode 100644 index 000000000..752435d24 --- /dev/null +++ b/vpx_dsp/x86/inv_txfm_avx2.c @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <immintrin.h> // AVX2 + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/txfm_common.h" + +#define PAIR256_SET_EPI16(a, b) \ + _mm256_set_epi16((int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a), \ + (int16_t)(b), (int16_t)(a), (int16_t)(b), (int16_t)(a)) + +static INLINE void idct_load16x16(const tran_low_t *input, __m256i *in, + int stride) { + int i; + // Load 16x16 values + for (i = 0; i < 16; i++) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i in0 = _mm_loadu_si128((const __m128i *)(input + i * stride)); + const __m128i in1 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 4)); + const __m128i in2 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 8)); + const __m128i in3 = + _mm_loadu_si128((const __m128i *)((input + i * stride) + 12)); + const __m128i ls = _mm_packs_epi32(in0, in1); + const __m128i rs = _mm_packs_epi32(in2, in3); + in[i] = _mm256_inserti128_si256(_mm256_castsi128_si256(ls), rs, 1); +#else + in[i] = _mm256_load_si256((const __m256i *)(input + i * stride)); +#endif + } +} + +static INLINE __m256i dct_round_shift_avx2(__m256i in) { + const __m256i t = _mm256_add_epi32(in, _mm256_set1_epi32(DCT_CONST_ROUNDING)); + return _mm256_srai_epi32(t, DCT_CONST_BITS); +} + +static INLINE __m256i idct_madd_round_shift_avx2(__m256i *in, __m256i *cospi) { + const __m256i t = _mm256_madd_epi16(*in, *cospi); + return dct_round_shift_avx2(t); +} + +// Calculate the dot product between in0/1 and x and wrap to short. +static INLINE __m256i idct_calc_wraplow_avx2(__m256i *in0, __m256i *in1, + __m256i *x) { + const __m256i t0 = idct_madd_round_shift_avx2(in0, x); + const __m256i t1 = idct_madd_round_shift_avx2(in1, x); + return _mm256_packs_epi32(t0, t1); +} + +// Multiply elements by constants and add them together. 
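+// (Scalar view of butterfly16() below, illustrative only: with
+// cst0 = (c0, -c1) and cst1 = (c1, c0), each input lane pair undergoes
+// the rotation
+//
+//   *out0 = ROUND_POWER_OF_TWO(in0 * c0 - in1 * c1, DCT_CONST_BITS);
+//   *out1 = ROUND_POWER_OF_TWO(in0 * c1 + in1 * c0, DCT_CONST_BITS);
+//
+// i.e. the standard iDCT butterfly, with idct_calc_wraplow_avx2()
+// providing the rounding and the pack back to 16 bits.)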
+static INLINE void butterfly16(__m256i in0, __m256i in1, int c0, int c1, + __m256i *out0, __m256i *out1) { + __m256i cst0 = PAIR256_SET_EPI16(c0, -c1); + __m256i cst1 = PAIR256_SET_EPI16(c1, c0); + __m256i lo = _mm256_unpacklo_epi16(in0, in1); + __m256i hi = _mm256_unpackhi_epi16(in0, in1); + *out0 = idct_calc_wraplow_avx2(&lo, &hi, &cst0); + *out1 = idct_calc_wraplow_avx2(&lo, &hi, &cst1); +} + +static INLINE void idct16_16col(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[1], in[15], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[9], in[7], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[5], in[11], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[13], in[3], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + butterfly16(in[2], in[14], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[10], in[6], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[10], step2[11]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[14], step2[15]); + + // stage 4 + butterfly16(in[0], in[8], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[4], in[12], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[10], step1[13], -cospi_8_64, -cospi_24_64, &step2[13], + &step2[10]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step1[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step1[7] = _mm256_add_epi16(step1[6], step1[7]); + step2[8] = step1[8]; + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], step2[12]); + + // stage 6 + step2[0] = _mm256_add_epi16(step1[0], step1[7]); + step2[1] = _mm256_add_epi16(step1[1], step1[6]); + step2[2] = _mm256_add_epi16(step1[2], step1[5]); + step2[3] = _mm256_add_epi16(step1[3], step1[4]); + step2[4] = _mm256_sub_epi16(step1[3], step1[4]); + step2[5] = _mm256_sub_epi16(step1[2], step1[5]); + step2[6] = _mm256_sub_epi16(step1[1], step1[6]); + step2[7] = _mm256_sub_epi16(step1[0], step1[7]); + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &step2[10], + &step2[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &step2[11], + &step2[12]); + + // stage 7 + out[0] = _mm256_add_epi16(step2[0], step1[15]); + out[1] = _mm256_add_epi16(step2[1], step1[14]); + out[2] = _mm256_add_epi16(step2[2], 
step2[13]); + out[3] = _mm256_add_epi16(step2[3], step2[12]); + out[4] = _mm256_add_epi16(step2[4], step2[11]); + out[5] = _mm256_add_epi16(step2[5], step2[10]); + out[6] = _mm256_add_epi16(step2[6], step1[9]); + out[7] = _mm256_add_epi16(step2[7], step1[8]); + out[8] = _mm256_sub_epi16(step2[7], step1[8]); + out[9] = _mm256_sub_epi16(step2[6], step1[9]); + out[10] = _mm256_sub_epi16(step2[5], step2[10]); + out[11] = _mm256_sub_epi16(step2[4], step2[11]); + out[12] = _mm256_sub_epi16(step2[3], step2[12]); + out[13] = _mm256_sub_epi16(step2[2], step2[13]); + out[14] = _mm256_sub_epi16(step2[1], step1[14]); + out[15] = _mm256_sub_epi16(step2[0], step1[15]); +} + +static INLINE void recon_and_store16(uint8_t *dest, __m256i in_x) { + const __m256i zero = _mm256_setzero_si256(); + __m256i d0 = _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dest))); + d0 = _mm256_permute4x64_epi64(d0, 0xd8); + d0 = _mm256_unpacklo_epi8(d0, zero); + d0 = _mm256_add_epi16(in_x, d0); + d0 = _mm256_packus_epi16( + d0, _mm256_castsi128_si256(_mm256_extractf128_si256(d0, 1))); + + _mm_storeu_si128((__m128i *)dest, _mm256_castsi256_si128(d0)); +} + +static INLINE void write_buffer_16x1(uint8_t *dest, __m256i in) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + __m256i out; + out = _mm256_adds_epi16(in, final_rounding); + out = _mm256_srai_epi16(out, 6); + recon_and_store16(dest, out); +} + +static INLINE void store_buffer_16x32(__m256i *in, uint8_t *dst, int stride) { + const __m256i final_rounding = _mm256_set1_epi16(1 << 5); + int j = 0; + while (j < 32) { + in[j] = _mm256_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm256_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm256_srai_epi16(in[j], 6); + in[j + 1] = _mm256_srai_epi16(in[j + 1], 6); + + recon_and_store16(dst, in[j]); + dst += stride; + recon_and_store16(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void transpose2_8x8_avx2(__m256i *in, __m256i *out) { + int i; + __m256i t[16], u[16]; + // (1st, 2nd) ==> (lo, hi) + // (0, 1) ==> (0, 1) + // (2, 3) ==> (2, 3) + // (4, 5) ==> (4, 5) + // (6, 7) ==> (6, 7) + for (i = 0; i < 4; i++) { + t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]); + t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 2) ==> (0, 2) + // (1, 3) ==> (1, 3) + // (4, 6) ==> (4, 6) + // (5, 7) ==> (5, 7) + for (i = 0; i < 2; i++) { + u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]); + u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]); + + u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]); + u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]); + } + + // (1st, 2nd) ==> (lo, hi) + // (0, 4) ==> (0, 1) + // (1, 5) ==> (4, 5) + // (2, 6) ==> (2, 3) + // (3, 7) ==> (6, 7) + for (i = 0; i < 2; i++) { + out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]); + out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]); + + out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]); + out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]); + } +} + +static INLINE void transpose_16bit_16x16_avx2(__m256i *in, __m256i *out) { + __m256i t[16]; + +#define LOADL(idx) \ + t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \ + t[idx] = _mm256_inserti128_si256( \ + t[idx], _mm_load_si128((__m128i const *)&in[(idx) + 8]), 1); + +#define LOADR(idx) \ + t[8 + (idx)] = \ + _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \ + t[8 + (idx)] = _mm256_inserti128_si256( \ + t[8 + 
(idx)], _mm_load_si128((__m128i const *)&in[(idx) + 8] + 1), 1); + + // load left 8x16 + LOADL(0) + LOADL(1) + LOADL(2) + LOADL(3) + LOADL(4) + LOADL(5) + LOADL(6) + LOADL(7) + + // load right 8x16 + LOADR(0) + LOADR(1) + LOADR(2) + LOADR(3) + LOADR(4) + LOADR(5) + LOADR(6) + LOADR(7) + + // get the top 16x8 result + transpose2_8x8_avx2(t, out); + // get the bottom 16x8 result + transpose2_8x8_avx2(&t[8], &out[8]); +} + +void vpx_idct16x16_256_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + __m256i in[16]; + + // Load 16x16 values + idct_load16x16(input, in, 16); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + transpose_16bit_16x16_avx2(in, in); + idct16_16col(in, in); + + for (i = 0; i < 16; ++i) { + write_buffer_16x1(dest + i * stride, in[i]); + } +} + +// Only do addition and subtraction butterfly, size = 16, 32 +static INLINE void add_sub_butterfly_avx2(__m256i *in, __m256i *out, int size) { + int i = 0; + const int num = size >> 1; + const int bound = size - 1; + while (i < num) { + out[i] = _mm256_add_epi16(in[i], in[bound - i]); + out[bound - i] = _mm256_sub_epi16(in[i], in[bound - i]); + i++; + } +} + +// For each 16x32 block __m256i in[32], +// Input with index, 0, 4, 8, 12, 16, 20, 24, 28 +// output pixels: 0-7 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_1(__m256i *in, __m256i *out) { + __m256i step1[8], step2[8]; + + // stage 3 + butterfly16(in[4], in[28], cospi_28_64, cospi_4_64, &step1[4], &step1[7]); + butterfly16(in[20], in[12], cospi_12_64, cospi_20_64, &step1[5], &step1[6]); + + // stage 4 + butterfly16(in[0], in[16], cospi_16_64, cospi_16_64, &step2[1], &step2[0]); + butterfly16(in[8], in[24], cospi_24_64, cospi_8_64, &step2[2], &step2[3]); + step2[4] = _mm256_add_epi16(step1[4], step1[5]); + step2[5] = _mm256_sub_epi16(step1[4], step1[5]); + step2[6] = _mm256_sub_epi16(step1[7], step1[6]); + step2[7] = _mm256_add_epi16(step1[7], step1[6]); + + // stage 5 + step1[0] = _mm256_add_epi16(step2[0], step2[3]); + step1[1] = _mm256_add_epi16(step2[1], step2[2]); + step1[2] = _mm256_sub_epi16(step2[1], step2[2]); + step1[3] = _mm256_sub_epi16(step2[0], step2[3]); + step1[4] = step2[4]; + butterfly16(step2[6], step2[5], cospi_16_64, cospi_16_64, &step1[5], + &step1[6]); + step1[7] = step2[7]; + + // stage 6 + out[0] = _mm256_add_epi16(step1[0], step1[7]); + out[1] = _mm256_add_epi16(step1[1], step1[6]); + out[2] = _mm256_add_epi16(step1[2], step1[5]); + out[3] = _mm256_add_epi16(step1[3], step1[4]); + out[4] = _mm256_sub_epi16(step1[3], step1[4]); + out[5] = _mm256_sub_epi16(step1[2], step1[5]); + out[6] = _mm256_sub_epi16(step1[1], step1[6]); + out[7] = _mm256_sub_epi16(step1[0], step1[7]); +} + +static INLINE void idct32_16x32_quarter_2_stage_4_to_6(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[8] = step1[8]; + step2[15] = step1[15]; + butterfly16(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9], + &step2[14]); + butterfly16(step1[13], step1[10], -cospi_8_64, cospi_24_64, &step2[10], + &step2[13]); + step2[11] = step1[11]; + step2[12] = step1[12]; + + // stage 5 + step1[8] = _mm256_add_epi16(step2[8], step2[11]); + step1[9] = _mm256_add_epi16(step2[9], step2[10]); + step1[10] = _mm256_sub_epi16(step2[9], step2[10]); + step1[11] = _mm256_sub_epi16(step2[8], step2[11]); + step1[12] = _mm256_sub_epi16(step2[15], step2[12]); + step1[13] = _mm256_sub_epi16(step2[14], step2[13]); + step1[14] = _mm256_add_epi16(step2[14], step2[13]); + step1[15] = _mm256_add_epi16(step2[15], 
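[Editor's aside, not part of the diff] The net effect of transpose_16bit_16x16_avx2(), which the unpack-16/32/64 passes above compute entirely in registers, is a plain 16x16 row/column swap. That swap is what lets the same idct16_16col() serve as both the row pass and the column pass in vpx_idct16x16_256_add_avx2(). A scalar sketch:

#include <stdint.h>

/* Reference semantics of the register transpose above. */
static void transpose_16x16_scalar(const int16_t in[16][16],
                                   int16_t out[16][16]) {
  int r, c;
  for (r = 0; r < 16; ++r)
    for (c = 0; c < 16; ++c) out[c][r] = in[r][c];
}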
step2[12]); + + // stage 6 + out[8] = step1[8]; + out[9] = step1[9]; + butterfly16(step1[13], step1[10], cospi_16_64, cospi_16_64, &out[10], + &out[13]); + butterfly16(step1[12], step1[11], cospi_16_64, cospi_16_64, &out[11], + &out[12]); + out[14] = step1[14]; + out[15] = step1[15]; +} + +// For each 16x32 block __m256i in[32], +// Input with index, 2, 6, 10, 14, 18, 22, 26, 30 +// output pixels: 8-15 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_2(__m256i *in, __m256i *out) { + __m256i step1[16], step2[16]; + + // stage 2 + butterfly16(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8], &step2[15]); + butterfly16(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9], &step2[14]); + butterfly16(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10], &step2[13]); + butterfly16(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11], &step2[12]); + + // stage 3 + step1[8] = _mm256_add_epi16(step2[8], step2[9]); + step1[9] = _mm256_sub_epi16(step2[8], step2[9]); + step1[10] = _mm256_sub_epi16(step2[11], step2[10]); + step1[11] = _mm256_add_epi16(step2[11], step2[10]); + step1[12] = _mm256_add_epi16(step2[12], step2[13]); + step1[13] = _mm256_sub_epi16(step2[12], step2[13]); + step1[14] = _mm256_sub_epi16(step2[15], step2[14]); + step1[15] = _mm256_add_epi16(step2[15], step2[14]); + + idct32_16x32_quarter_2_stage_4_to_6(step1, out); +} + +static INLINE void idct32_16x32_quarter_3_4_stage_4_to_7(__m256i *step1, + __m256i *out) { + __m256i step2[32]; + + // stage 4 + step2[16] = _mm256_add_epi16(step1[16], step1[19]); + step2[17] = _mm256_add_epi16(step1[17], step1[18]); + step2[18] = _mm256_sub_epi16(step1[17], step1[18]); + step2[19] = _mm256_sub_epi16(step1[16], step1[19]); + step2[20] = _mm256_sub_epi16(step1[23], step1[20]); + step2[21] = _mm256_sub_epi16(step1[22], step1[21]); + step2[22] = _mm256_add_epi16(step1[22], step1[21]); + step2[23] = _mm256_add_epi16(step1[23], step1[20]); + + step2[24] = _mm256_add_epi16(step1[24], step1[27]); + step2[25] = _mm256_add_epi16(step1[25], step1[26]); + step2[26] = _mm256_sub_epi16(step1[25], step1[26]); + step2[27] = _mm256_sub_epi16(step1[24], step1[27]); + step2[28] = _mm256_sub_epi16(step1[31], step1[28]); + step2[29] = _mm256_sub_epi16(step1[30], step1[29]); + step2[30] = _mm256_add_epi16(step1[29], step1[30]); + step2[31] = _mm256_add_epi16(step1[28], step1[31]); + + // stage 5 + step1[16] = step2[16]; + step1[17] = step2[17]; + butterfly16(step2[29], step2[18], cospi_24_64, cospi_8_64, &step1[18], + &step1[29]); + butterfly16(step2[28], step2[19], cospi_24_64, cospi_8_64, &step1[19], + &step1[28]); + butterfly16(step2[27], step2[20], -cospi_8_64, cospi_24_64, &step1[20], + &step1[27]); + butterfly16(step2[26], step2[21], -cospi_8_64, cospi_24_64, &step1[21], + &step1[26]); + step1[22] = step2[22]; + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[25] = step2[25]; + step1[30] = step2[30]; + step1[31] = step2[31]; + + // stage 6 + out[16] = _mm256_add_epi16(step1[16], step1[23]); + out[17] = _mm256_add_epi16(step1[17], step1[22]); + out[18] = _mm256_add_epi16(step1[18], step1[21]); + out[19] = _mm256_add_epi16(step1[19], step1[20]); + step2[20] = _mm256_sub_epi16(step1[19], step1[20]); + step2[21] = _mm256_sub_epi16(step1[18], step1[21]); + step2[22] = _mm256_sub_epi16(step1[17], step1[22]); + step2[23] = _mm256_sub_epi16(step1[16], step1[23]); + + step2[24] = _mm256_sub_epi16(step1[31], step1[24]); + step2[25] = _mm256_sub_epi16(step1[30], step1[25]); + step2[26] = _mm256_sub_epi16(step1[29], step1[26]); + step2[27] = 
_mm256_sub_epi16(step1[28], step1[27]); + out[28] = _mm256_add_epi16(step1[27], step1[28]); + out[29] = _mm256_add_epi16(step1[26], step1[29]); + out[30] = _mm256_add_epi16(step1[25], step1[30]); + out[31] = _mm256_add_epi16(step1[24], step1[31]); + + // stage 7 + butterfly16(step2[27], step2[20], cospi_16_64, cospi_16_64, &out[20], + &out[27]); + butterfly16(step2[26], step2[21], cospi_16_64, cospi_16_64, &out[21], + &out[26]); + butterfly16(step2[25], step2[22], cospi_16_64, cospi_16_64, &out[22], + &out[25]); + butterfly16(step2[24], step2[23], cospi_16_64, cospi_16_64, &out[23], + &out[24]); +} + +static INLINE void idct32_1024_16x32_quarter_1_2(__m256i *in, __m256i *out) { + __m256i temp[16]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + idct32_1024_16x32_quarter_1(in, temp); + + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_2(in, temp); + + // stage 7 + add_sub_butterfly_avx2(temp, out, 16); +} + +// For each 16x32 block __m256i in[32], +// Input with odd index, +// 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 +// output pixels: 16-23, 24-31 in __m256i out[32] +static INLINE void idct32_1024_16x32_quarter_3_4(__m256i *in, __m256i *out) { + __m256i step1[32], step2[32]; + + // stage 1 + butterfly16(in[1], in[31], cospi_31_64, cospi_1_64, &step1[16], &step1[31]); + butterfly16(in[17], in[15], cospi_15_64, cospi_17_64, &step1[17], &step1[30]); + butterfly16(in[9], in[23], cospi_23_64, cospi_9_64, &step1[18], &step1[29]); + butterfly16(in[25], in[7], cospi_7_64, cospi_25_64, &step1[19], &step1[28]); + + butterfly16(in[5], in[27], cospi_27_64, cospi_5_64, &step1[20], &step1[27]); + butterfly16(in[21], in[11], cospi_11_64, cospi_21_64, &step1[21], &step1[26]); + + butterfly16(in[13], in[19], cospi_19_64, cospi_13_64, &step1[22], &step1[25]); + butterfly16(in[29], in[3], cospi_3_64, cospi_29_64, &step1[23], &step1[24]); + + // stage 2 + step2[16] = _mm256_add_epi16(step1[16], step1[17]); + step2[17] = _mm256_sub_epi16(step1[16], step1[17]); + step2[18] = _mm256_sub_epi16(step1[19], step1[18]); + step2[19] = _mm256_add_epi16(step1[19], step1[18]); + step2[20] = _mm256_add_epi16(step1[20], step1[21]); + step2[21] = _mm256_sub_epi16(step1[20], step1[21]); + step2[22] = _mm256_sub_epi16(step1[23], step1[22]); + step2[23] = _mm256_add_epi16(step1[23], step1[22]); + + step2[24] = _mm256_add_epi16(step1[24], step1[25]); + step2[25] = _mm256_sub_epi16(step1[24], step1[25]); + step2[26] = _mm256_sub_epi16(step1[27], step1[26]); + step2[27] = _mm256_add_epi16(step1[27], step1[26]); + step2[28] = _mm256_add_epi16(step1[28], step1[29]); + step2[29] = _mm256_sub_epi16(step1[28], step1[29]); + step2[30] = _mm256_sub_epi16(step1[31], step1[30]); + step2[31] = _mm256_add_epi16(step1[31], step1[30]); + + // stage 3 + step1[16] = step2[16]; + step1[31] = step2[31]; + butterfly16(step2[30], step2[17], cospi_28_64, cospi_4_64, &step1[17], + &step1[30]); + butterfly16(step2[29], step2[18], -cospi_4_64, cospi_28_64, &step1[18], + &step1[29]); + step1[19] = step2[19]; + step1[20] = step2[20]; + butterfly16(step2[26], step2[21], cospi_12_64, cospi_20_64, &step1[21], + &step1[26]); + butterfly16(step2[25], step2[22], -cospi_20_64, cospi_12_64, &step1[22], + &step1[25]); + step1[23] = step2[23]; + step1[24] = step2[24]; + step1[27] = step2[27]; + step1[28] = step2[28]; + + idct32_16x32_quarter_3_4_stage_4_to_7(step1, out); +} + +static 
INLINE void idct32_1024_16x32(__m256i *in, __m256i *out) { + __m256i temp[32]; + + // For each 16x32 block __m256i in[32], + // Input with index, 0, 4, 8, 12, 16, 20, 24, 28 + // output pixels: 0-7 in __m256i out[32] + // AND + // Input with index, 2, 6, 10, 14, 18, 22, 26, 30 + // output pixels: 8-15 in __m256i out[32] + idct32_1024_16x32_quarter_1_2(in, temp); + + // For each 16x32 block __m256i in[32], + // Input with odd index, + // 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 + // output pixels: 16-23, 24-31 in __m256i out[32] + idct32_1024_16x32_quarter_3_4(in, temp); + + // final stage + add_sub_butterfly_avx2(temp, out, 32); +} + +void vpx_idct32x32_1024_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i l[32], r[32], out[32], *in; + int i; + + in = l; + + for (i = 0; i < 2; i++) { + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + + idct_load16x16(input + 16, in + 16, 32); + transpose_16bit_16x16_avx2(in + 16, in + 16); + idct32_1024_16x32(in, in); + + in = r; + input += 32 << 4; + } + + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(l + i, out); + transpose_16bit_16x16_avx2(r + i, out + 16); + idct32_1024_16x32(out, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} + +// Case when only upper-left 16x16 has non-zero coeff +void vpx_idct32x32_135_add_avx2(const tran_low_t *input, uint8_t *dest, + int stride) { + __m256i in[32], io[32], out[32]; + int i; + + for (i = 16; i < 32; i++) { + in[i] = _mm256_setzero_si256(); + } + + // rows + idct_load16x16(input, in, 32); + transpose_16bit_16x16_avx2(in, in); + idct32_1024_16x32(in, io); + + // columns + for (i = 0; i < 32; i += 16) { + transpose_16bit_16x16_avx2(io + i, in); + idct32_1024_16x32(in, out); + + store_buffer_16x32(out, dest, stride); + dest += 16; + } +} diff --git a/vpx_dsp/x86/quantize_avx.c b/vpx_dsp/x86/quantize_avx.c index 7d8352721..5ff5abc11 100644 --- a/vpx_dsp/x86/quantize_avx.c +++ b/vpx_dsp/x86/quantize_avx.c @@ -19,17 +19,18 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -38,12 +39,9 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - *eob_ptr = 0; - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. 
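[Editor's aside, not part of the diff] For reference while reading the quantize kernels below, a scalar sketch of the b-quantizer they vectorize, simplified from the C reference vpx_quantize_b_c(); the int16_t clamp of the rounded value is omitted for brevity, and zbin here is the unadjusted value (the SIMD paths pre-subtract 1 so they can use a strict pcmpgtw):

#include <stdint.h>

typedef int32_t tran_low_t; /* as in the non-high-bitdepth build */

/* Index 0 of each table is the DC value; index 1 covers all AC values. */
static void quantize_b_sketch(const tran_low_t *coeff, intptr_t n,
                              const int16_t zbin[2], const int16_t round[2],
                              const int16_t quant[2], const int16_t shift[2],
                              const int16_t dequant[2], const int16_t *scan,
                              tran_low_t *qcoeff, tran_low_t *dqcoeff,
                              uint16_t *eob) {
  intptr_t i;
  int last = -1;
  for (i = 0; i < n; i++) {
    const int rc = scan[i];
    const int c = coeff[rc];
    const int sign = c < 0 ? -1 : 1;
    const int abs_c = c * sign;
    qcoeff[rc] = 0;
    dqcoeff[rc] = 0;
    if (abs_c >= zbin[rc != 0]) {
      int tmp = abs_c + round[rc != 0]; /* C reference clamps to int16_t */
      tmp = ((((tmp * quant[rc != 0]) >> 16) + tmp) * shift[rc != 0]) >> 16;
      qcoeff[rc] = tmp * sign;
      dqcoeff[rc] = qcoeff[rc] * dequant[rc != 0];
      if (tmp) last = (int)i;
    }
  }
  *eob = (uint16_t)(last + 1);
}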
coeff0 = load_tran_low(coeff_ptr); @@ -140,17 +138,15 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); const __m256i big_zero = _mm256_setzero_si256(); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -159,27 +155,8 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - // Setup global values. - // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - shift = _mm_slli_epi16(shift, 1); + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_avx2.c b/vpx_dsp/x86/quantize_avx2.c index 28f7c9c7d..d4872f6bc 100644 --- a/vpx_dsp/x86/quantize_avx2.c +++ b/vpx_dsp/x86/quantize_avx2.c @@ -13,13 +13,15 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" static VPX_FORCE_INLINE void load_b_values_avx2( - const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr, - __m256i *round, const int16_t *quant_ptr, __m256i *quant, - const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr, + const struct macroblock_plane *mb_plane, __m256i *zbin, __m256i *round, + __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant, __m256i *shift, int log_scale) { - *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr)); + *zbin = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->zbin)); *zbin = _mm256_permute4x64_epi64(*zbin, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -30,7 +32,8 @@ static VPX_FORCE_INLINE void load_b_values_avx2( // calculating the zbin mask. 
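[Editor's aside, not part of the diff; the parenthetical of the comment above continues just below] A scalar sketch of the zbin handling in load_b_values_avx2(): for log_scale = 1 (the 32x32 transform) zbin is halved with rounding, and in all cases 1 is pre-subtracted so the zbin test can be a strict greater-than:

#include <stdint.h>

static int16_t adjust_zbin(int16_t zbin, int log_scale) {
  if (log_scale > 0)
    zbin = (int16_t)((zbin + (1 << (log_scale - 1))) >> log_scale);
  return (int16_t)(zbin - 1); /* turn ">= zbin" into "> zbin - 1" */
}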
(See quantize_b_logscale{0,1,2}_16) *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1)); - *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr)); + *round = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->round)); *round = _mm256_permute4x64_epi64(*round, 0x54); if (log_scale > 0) { const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1))); @@ -38,12 +41,14 @@ static VPX_FORCE_INLINE void load_b_values_avx2( *round = _mm256_srai_epi16(*round, log_scale); } - *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr)); + *quant = + _mm256_castsi128_si256(_mm_load_si128((const __m128i *)mb_plane->quant)); *quant = _mm256_permute4x64_epi64(*quant, 0x54); *dequant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr)); *dequant = _mm256_permute4x64_epi64(*dequant, 0x54); - *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr)); + *shift = _mm256_castsi128_si256( + _mm_load_si128((const __m128i *)mb_plane->quant_shift)); *shift = _mm256_permute4x64_epi64(*shift, 0x54); } @@ -151,20 +156,17 @@ static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) { } void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 0); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 0); // Do DC and first 15 AC. v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round, &v_zbin, &v_quant_shift); @@ -250,23 +252,18 @@ static VPX_FORCE_INLINE __m256i quantize_b_32x32_16( } } -void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift; __m256i v_eobmax = _mm256_setzero_si256(); intptr_t count; - (void)n_coeffs; - (void)scan; + const int16_t *iscan = scan_order->iscan; - load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr, - &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr, - &v_quant_shift, 1); + load_b_values_avx2(mb_plane, &v_zbin, &v_round, &v_quant, dequant_ptr, + &v_dequant, &v_quant_shift, 1); // Do DC and first 15 AC. 
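[Editor's aside, not part of the diff] The eob max-reductions in these kernels implement the following scalar rule, which is why only iscan is needed from the ScanOrder: iscan maps a raster coefficient position to its place in scan order, and eob is one past the last nonzero coefficient in that order. A sketch:

#include <stdint.h>

typedef int32_t tran_low_t;

static uint16_t eob_from_iscan(const tran_low_t *qcoeff,
                               const int16_t *iscan, intptr_t n) {
  intptr_t i;
  int eob = 0;
  for (i = 0; i < n; i++)
    if (qcoeff[i] && iscan[i] + 1 > eob) eob = iscan[i] + 1;
  return (uint16_t)eob;
}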
v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan, diff --git a/vpx_dsp/x86/quantize_sse2.c b/vpx_dsp/x86/quantize_sse2.c index 9533e7916..64838eaa7 100644 --- a/vpx_dsp/x86/quantize_sse2.c +++ b/vpx_dsp/x86/quantize_sse2.c @@ -16,16 +16,16 @@ #include "vpx/vpx_integer.h" #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" +#include "vp9/common/vp9_scan.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, - tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan, - const int16_t *iscan) { + const struct macroblock_plane *const mb_plane, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1, coeff0_sign, coeff1_sign; @@ -33,11 +33,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - // Setup global values. - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/quantize_sse2.h b/vpx_dsp/x86/quantize_sse2.h index 27bfb4e41..82c755a0c 100644 --- a/vpx_dsp/x86/quantize_sse2.h +++ b/vpx_dsp/x86/quantize_sse2.h @@ -15,26 +15,53 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_block.h" -static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin, - const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_b_values(const struct macroblock_plane *const mb_plane, + __m128i *zbin, __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant, - const int16_t *shift_ptr, __m128i *shift) { - *zbin = _mm_load_si128((const __m128i *)zbin_ptr); - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + __m128i *shift) { + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1)); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); - *shift = _mm_load_si128((const __m128i *)shift_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); } -static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round, - const int16_t *quant_ptr, __m128i *quant, +static INLINE void load_b_values32x32( + const struct macroblock_plane *const mb_plane, __m128i *zbin, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, + __m128i *dequant, __m128i *shift) { + const __m128i one = _mm_set1_epi16(1); + // The 32x32 halves zbin and round. + *zbin = _mm_load_si128((const __m128i *)mb_plane->zbin); + // Shift with rounding. + *zbin = _mm_add_epi16(*zbin, one); + *zbin = _mm_srli_epi16(*zbin, 1); + // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so + // it is a strict "greater" comparison. 
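[Editor's aside, not part of the diff; the subtraction that the comment above describes, plus the round and shift adjustments, follow immediately below] A scalar sketch of the complete load_b_values32x32() setup: zbin and round are halved with rounding, and quant_shift is doubled because the 32x32 C reference shifts the final product by 15 instead of 16:

#include <stdint.h>

static void setup_b_32x32(int16_t *zbin, int16_t *round, int16_t *shift) {
  *zbin = (int16_t)(((*zbin + 1) >> 1) - 1); /* halve, then -1 for pcmpgtw */
  *round = (int16_t)((*round + 1) >> 1);     /* halve with rounding */
  *shift = (int16_t)(*shift << 1);           /* double quant_shift */
}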
+ *zbin = _mm_sub_epi16(*zbin, one); + + *round = _mm_load_si128((const __m128i *)mb_plane->round); + *round = _mm_add_epi16(*round, one); + *round = _mm_srli_epi16(*round, 1); + + *quant = _mm_load_si128((const __m128i *)mb_plane->quant); + *dequant = _mm_load_si128((const __m128i *)dequant_ptr); + *shift = _mm_load_si128((const __m128i *)mb_plane->quant_shift); + // I suspect this is not technically OK because quant_shift can be up + // to 1 << 16 and shifting up again will outrange that, but the test is not + // comprehensive enough to catch that and "it's been that way forever" + *shift = _mm_slli_epi16(*shift, 1); +} + +static INLINE void load_fp_values(const struct macroblock_plane *mb_plane, + __m128i *round, __m128i *quant, const int16_t *dequant_ptr, __m128i *dequant) { - *round = _mm_load_si128((const __m128i *)round_ptr); - *quant = _mm_load_si128((const __m128i *)quant_ptr); + *round = _mm_load_si128((const __m128i *)mb_plane->round_fp); + *quant = _mm_load_si128((const __m128i *)mb_plane->quant_fp); *dequant = _mm_load_si128((const __m128i *)dequant_ptr); } diff --git a/vpx_dsp/x86/quantize_ssse3.c b/vpx_dsp/x86/quantize_ssse3.c index 476230286..2c6d851a1 100644 --- a/vpx_dsp/x86/quantize_ssse3.c +++ b/vpx_dsp/x86/quantize_ssse3.c @@ -16,16 +16,17 @@ #include "vpx_dsp/x86/bitdepth_conversion_sse2.h" #include "vpx_dsp/x86/quantize_sse2.h" #include "vpx_dsp/x86/quantize_ssse3.h" +#include "vp9/common/vp9_scan.h" +#include "vp9/encoder/vp9_block.h" void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); int index = 16; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -33,10 +34,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i cmp_mask0, cmp_mask1; __m128i eob, eob0; - (void)scan; - - load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant, - dequant_ptr, &dequant, quant_shift_ptr, &shift); + load_b_values(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); @@ -107,17 +105,14 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = accumulate_eob(eob); } -void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, - const int16_t *zbin_ptr, - const int16_t *round_ptr, - const int16_t *quant_ptr, - const int16_t *quant_shift_ptr, +void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, + const struct macroblock_plane *const mb_plane, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan) { + const struct ScanOrder *const scan_order) { const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); int index; + const int16_t *iscan = scan_order->iscan; __m128i zbin, round, quant, dequant, shift; __m128i coeff0, coeff1; @@ -126,30 +121,8 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i all_zero; __m128i eob = zero, eob0; - (void)scan; - (void)n_coeffs; - - // Setup global values. 
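[Editor's aside, not part of the diff] The "I suspect" comment carried into load_b_values32x32() above can be illustrated with a tiny, purely hypothetical example of the 16-bit wraparound it worries about (values are illustrative; two's-complement narrowing is assumed):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int16_t shift = (int16_t)0x4000;         /* 1 << 14 */
  const int16_t doubled = (int16_t)(shift << 1); /* 0x8000: sign flipped */
  printf("%d << 1 wraps to %d in a 16-bit lane\n", shift, doubled);
  return 0;
}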
- // The 32x32 halves zbin and round. - zbin = _mm_load_si128((const __m128i *)zbin_ptr); - // Shift with rounding. - zbin = _mm_add_epi16(zbin, one); - zbin = _mm_srli_epi16(zbin, 1); - // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so - // it is a strict "greater" comparison. - zbin = _mm_sub_epi16(zbin, one); - - round = _mm_load_si128((const __m128i *)round_ptr); - round = _mm_add_epi16(round, one); - round = _mm_srli_epi16(round, 1); - - quant = _mm_load_si128((const __m128i *)quant_ptr); - dequant = _mm_load_si128((const __m128i *)dequant_ptr); - shift = _mm_load_si128((const __m128i *)quant_shift_ptr); - // I suspect this is not technically OK because quant_shift can be up - // to 1 << 16 and shifting up again will outrange that, but the test is not - // comprehensive enough to catch that and "it's been that way forever" - shift = _mm_slli_epi16(shift, 1); + load_b_values32x32(mb_plane, &zbin, &round, &quant, dequant_ptr, &dequant, + &shift); // Do DC and first 15 AC. coeff0 = load_tran_low(coeff_ptr); diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 399b67b3f..cf7111983 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -25,9 +25,10 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, _mm_storeu_si128((__m128i *)sad_array, sum); } -void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad32xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -41,7 +42,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 32; i++) { + for (i = 0; i < h; i++) { __m256i r[4]; // load src and all ref[] @@ -73,9 +74,10 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } -void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[4], int ref_stride, - uint32_t sad_array[4]) { +static INLINE void sad64xhx4d_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *const ref_array[4], + int ref_stride, int h, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; @@ -89,7 +91,7 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, sums[2] = _mm256_setzero_si256(); sums[3] = _mm256_setzero_si256(); - for (i = 0; i < 64; i++) { + for (i = 0; i < h; i++) { __m256i r_lo[4], r_hi[4]; // load 64 bytes from src and all ref[] const __m256i s_lo = _mm256_load_si256((const __m256i *)src_ptr); @@ -132,3 +134,51 @@ void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, calc_final_4(sums, sad_array); } + +#define SAD64_H(h) \ + void vpx_sad64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +#define SAD32_H(h) \ + void vpx_sad32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, src_stride, ref_array, ref_stride, h, sad_array); \ + } + +SAD64_H(64) +SAD32_H(32) + +#define SADS64_H(h) \ + void vpx_sad_skip_64x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const 
ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad64xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +#define SADS32_H(h) \ + void vpx_sad_skip_32x##h##x4d_avx2(const uint8_t *src, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + sad32xhx4d_avx2(src, 2 * src_stride, ref_array, 2 * ref_stride, \ + ((h) >> 1), sad_array); \ + sad_array[0] <<= 1; \ + sad_array[1] <<= 1; \ + sad_array[2] <<= 1; \ + sad_array[3] <<= 1; \ + } + +SADS64_H(64) +SADS64_H(32) + +SADS32_H(64) +SADS32_H(32) +SADS32_H(16) diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm index 3f6e55ce9..ed4ea3ef9 100644 --- a/vpx_dsp/x86/sad4d_sse2.asm +++ b/vpx_dsp/x86/sad4d_sse2.asm @@ -179,7 +179,16 @@ SECTION .text ; uint8_t *ref[4], int ref_stride, ; uint32_t res[4]); ; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16, 8x8, 8x4, 4x8 and 4x4 -%macro SADNXN4D 2 +%macro SADNXN4D 2-3 0 +%if %3 == 1 ; skip rows +%if UNIX64 +cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ + res, ref2, ref3, ref4 +%else +cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ + ref2, ref3, ref4 +%endif +%else ; normal sad %if UNIX64 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ res, ref2, ref3, ref4 @@ -187,6 +196,11 @@ cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ ref2, ref3, ref4 %endif +%endif +%if %3 == 1 + lea src_strided, [2*src_strided] + lea ref_strided, [2*ref_strided] +%endif movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided mov ref2q, [ref1q+gprsize*1] @@ -195,9 +209,15 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ mov ref1q, [ref1q+gprsize*0] PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 +%if %3 == 1 ; downsample number of rows by 2 +%define num_rep (%2-8)/4 +%else +%define num_rep (%2-4)/2 +%endif +%rep num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 %endrep +%undef num_rep PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 %if %1 > 4 @@ -211,12 +231,19 @@ cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ punpckhqdq m5, m7 movifnidn r4, r4mp paddd m4, m5 +%if %3 == 1 + pslld m4, 1 +%endif movu [r4], m4 RET %else movifnidn r4, r4mp pshufd m6, m6, 0x08 pshufd m7, m7, 0x08 +%if %3 == 1 + pslld m6, 1 + pslld m7, 1 +%endif movq [r4+0], m6 movq [r4+8], m7 RET @@ -237,3 +264,15 @@ SADNXN4D 8, 8 SADNXN4D 8, 4 SADNXN4D 4, 8 SADNXN4D 4, 4 + +SADNXN4D 64, 64, 1 +SADNXN4D 64, 32, 1 +SADNXN4D 32, 64, 1 +SADNXN4D 32, 32, 1 +SADNXN4D 32, 16, 1 +SADNXN4D 16, 32, 1 +SADNXN4D 16, 16, 1 +SADNXN4D 16, 8, 1 +SADNXN4D 8, 16, 1 +SADNXN4D 8, 8, 1 +SADNXN4D 4, 8, 1 diff --git a/vpx_dsp/x86/sad_avx2.c b/vpx_dsp/x86/sad_avx2.c index 29bedb0e6..e00494d76 100644 --- a/vpx_dsp/x86/sad_avx2.c +++ b/vpx_dsp/x86/sad_avx2.c @@ -11,73 +11,104 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_ports/mem.h" +static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + for (i = 0; i < h; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i 
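[Editor's aside, not part of the diff] The vpx_sad_skip_* entry points added above (C macros and assembly alike) all use the same approximation: evaluate the SAD on every other row by doubling both strides and halving the row count, then double the result so it stays on the same scale as a full SAD. A scalar sketch:

#include <stdint.h>
#include <stdlib.h>

static unsigned int sad_skip_sketch(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    int width, int height) {
  unsigned int sad = 0;
  int x, y;
  for (y = 0; y < height; y += 2) { /* skipped rows */
    for (x = 0; x < width; x++) sad += (unsigned int)abs(src[x] - ref[x]);
    src += 2 * src_stride;
    ref += 2 * ref_stride;
  }
  return 2 * sad; /* compensate for the halved row count */
}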
const *)(ref_ptr + 32)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref_stride; + src_ptr += src_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + +static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + int h) { + int i, res; + __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; + __m256i sum_sad = _mm256_setzero_si256(); + __m256i sum_sad_h; + __m128i sum_sad128; + const int ref2_stride = ref_stride << 1; + const int src2_stride = src_stride << 1; + const int max = h >> 1; + for (i = 0; i < max; i++) { + ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); + ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); + sad1_reg = + _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); + sad2_reg = _mm256_sad_epu8( + ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); + sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); + ref_ptr += ref2_stride; + src_ptr += src2_stride; + } + sum_sad_h = _mm256_srli_si256(sum_sad, 8); + sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); + sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); + sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); + res = _mm_cvtsi128_si32(sum_sad128); + return res; +} + #define FSAD64_H(h) \ unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0; i < h; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref_stride; \ - src_ptr += src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \ + return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS64_H(h) \ + unsigned int vpx_sad_skip_64x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } #define FSAD32_H(h) \ unsigned int vpx_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride 
= src_stride << 1; \ - int max = h >> 1; \ - for (i = 0; i < max; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8( \ - ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8( \ - ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = \ - _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr += ref2_stride; \ - src_ptr += src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ + return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \ + } + +#define FSADS32_H(h) \ + unsigned int vpx_sad_skip_32x##h##_avx2( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride) { \ + return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \ + h / 2); \ } -#define FSAD64 \ - FSAD64_H(64) \ - FSAD64_H(32) +#define FSAD64 \ + FSAD64_H(64) \ + FSAD64_H(32) \ + FSADS64_H(64) \ + FSADS64_H(32) -#define FSAD32 \ - FSAD32_H(64) \ - FSAD32_H(32) \ - FSAD32_H(16) +#define FSAD32 \ + FSAD32_H(64) \ + FSAD32_H(32) \ + FSAD32_H(16) \ + FSADS32_H(64) \ + FSADS32_H(32) \ + FSADS32_H(16) FSAD64 FSAD32 @@ -86,6 +117,8 @@ FSAD32 #undef FSAD32 #undef FSAD64_H #undef FSAD32_H +#undef FSADS64_H +#undef FSADS32_H #define FSADAVG64_H(h) \ unsigned int vpx_sad64x##h##_avg_avx2( \ diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm index e4e1bc3e9..627e463bf 100644 --- a/vpx_dsp/x86/sad_sse2.asm +++ b/vpx_dsp/x86/sad_sse2.asm @@ -12,15 +12,29 @@ SECTION .text +; Macro Arguments +; Arg 1: Width +; Arg 2: Height +; Arg 3: Number of general purpose registers +; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows %macro SAD_FN 4 -%if %4 == 0 +%if %4 == 0 ; normal sad %if %3 == 5 cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows %else ; %3 == 7 cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ src_stride3, ref_stride3, n_rows %endif ; %3 == 5/7 -%else ; avg + +%elif %4 == 2 ; skip +%if %3 == 5 +cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows +%else ; %3 == 7 +cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \ + src_stride3, ref_stride3, n_rows +%endif ; %3 == 5/7 + +%else %if %3 == 5 cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ second_pred, n_rows @@ -35,7 +49,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ %define n_rowsd dword r0m %endif ; x86-32/64 %endif ; %3 == 5/7 -%endif ; avg/sad +%endif ; sad/avg/skip +%if %4 == 2; skip rows so double the stride +lea src_strided, [src_strided*2] +lea ref_strided, [ref_strided*2] +%endif ; %4 skip movsxdifnidn src_strideq, src_strided movsxdifnidn ref_strideq, ref_strided %if %3 == 7 @@ -48,7 +66,11 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ ; uint8_t *ref, int ref_stride); %macro SAD64XN 1-2 0 SAD_FN 64, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/2 +%else mov n_rowsd, %1 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -77,6 +99,9 @@ cglobal sad%1x%2_avg, 5, VPX_ARCH_X86_64 + %3, 6, src, src_stride, \ movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the 
sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -86,12 +111,18 @@ SAD64XN 64 ; sad64x64_sse2 SAD64XN 32 ; sad64x32_sse2 SAD64XN 64, 1 ; sad64x64_avg_sse2 SAD64XN 32, 1 ; sad64x32_avg_sse2 +SAD64XN 64, 2 ; sad64x64_skip_sse2 +SAD64XN 32, 2 ; sad64x32_skip_sse2 ; unsigned int vpx_sad32x32_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD32XN 1-2 0 SAD_FN 32, %1, 5, %2 +%if %2 == 2 + mov n_rowsd, %1/4 +%else mov n_rowsd, %1/2 +%endif pxor m0, m0 .loop: movu m1, [refq] @@ -120,6 +151,9 @@ SAD64XN 32, 1 ; sad64x32_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -131,12 +165,19 @@ SAD32XN 16 ; sad32x16_sse2 SAD32XN 64, 1 ; sad32x64_avg_sse2 SAD32XN 32, 1 ; sad32x32_avg_sse2 SAD32XN 16, 1 ; sad32x16_avg_sse2 +SAD32XN 64, 2 ; sad32x64_skip_sse2 +SAD32XN 32, 2 ; sad32x32_skip_sse2 +SAD32XN 16, 2 ; sad32x16_skip_sse2 ; unsigned int vpx_sad16x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD16XN 1-2 0 SAD_FN 16, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -166,6 +207,9 @@ SAD32XN 16, 1 ; sad32x16_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -177,12 +221,19 @@ SAD16XN 8 ; sad16x8_sse2 SAD16XN 32, 1 ; sad16x32_avg_sse2 SAD16XN 16, 1 ; sad16x16_avg_sse2 SAD16XN 8, 1 ; sad16x8_avg_sse2 +SAD16XN 32, 2 ; sad16x32_skip_sse2 +SAD16XN 16, 2 ; sad16x16_skip_sse2 +SAD16XN 8, 2 ; sad16x8_skip_sse2 ; unsigned int vpx_sad8x{8,16}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD8XN 1-2 0 SAD_FN 8, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -210,6 +261,9 @@ SAD16XN 8, 1 ; sad16x8_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -221,12 +275,18 @@ SAD8XN 4 ; sad8x4_sse2 SAD8XN 16, 1 ; sad8x16_avg_sse2 SAD8XN 8, 1 ; sad8x8_avg_sse2 SAD8XN 4, 1 ; sad8x4_avg_sse2 +SAD8XN 16, 2 ; sad8x16_skip_sse2 +SAD8XN 8, 2 ; sad8x8_skip_sse2 ; unsigned int vpx_sad4x{4, 8}_sse2(uint8_t *src, int src_stride, ; uint8_t *ref, int ref_stride); %macro SAD4XN 1-2 0 SAD_FN 4, %1, 7, %2 +%if %2 == 2 + mov n_rowsd, %1/8 +%else mov n_rowsd, %1/4 +%endif pxor m0, m0 .loop: @@ -257,6 +317,9 @@ SAD8XN 4, 1 ; sad8x4_avg_sse2 movhlps m1, m0 paddd m0, m1 +%if %2 == 2 ; we skipped rows, so now we need to double the sad + pslld m0, 1 +%endif movd eax, m0 RET %endmacro @@ -266,3 +329,4 @@ SAD4XN 8 ; sad4x8_sse SAD4XN 4 ; sad4x4_sse SAD4XN 8, 1 ; sad4x8_avg_sse SAD4XN 4, 1 ; sad4x4_avg_sse +SAD4XN 8, 2 ; sad4x8_skip_sse diff --git a/vpx_dsp/x86/sse_avx2.c b/vpx_dsp/x86/sse_avx2.c new file mode 100644 index 000000000..917ff0ef1 --- /dev/null +++ b/vpx_dsp/x86/sse_avx2.c @@ -0,0 +1,367 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
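[Editor's aside, not part of the diff; the license header of the new file closes just below] The new sse_avx2.c and sse_sse4.c files accelerate vpx_sse, the sum of squared differences between two 8-bit blocks. A scalar sketch of the semantics, matching the vpx_sse_c reference:

#include <stdint.h>

static int64_t sse_sketch(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  int64_t sse = 0;
  int x, y;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      const int d = a[x] - b[x];
      sse += d * d;
    }
    a += a_stride;
    b += b_stride;
  }
  return sse;
}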
+ */ + +#include <smmintrin.h> +#include <immintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a, + const uint8_t *b) { + const __m256i v_a0 = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b0 = _mm256_loadu_si256((const __m256i *)b); + const __m256i zero = _mm256_setzero_si256(); + const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero); + const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero); + const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero); + const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero); + const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w); + const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE int64_t summary_all_avx2(const __m256i *sum_all) { + int64_t sum; + __m256i zero = _mm256_setzero_si256(); + const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero); + const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) { + const __m256i sum0_4x64 = + _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32)); + const __m256i sum1_4x64 = + _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1)); + const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64); + *sum = _mm256_add_epi64(*sum, sum_4x64); +} + +static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) { + int64_t sum; + const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64), + _mm256_extracti128_si256(sum_4x64, 1)); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} +#endif + +static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + a_stride); + const __m128i v_a2 = load_unaligned_u32(a + a_stride * 2); + const __m128i v_a3 = load_unaligned_u32(a + a_stride * 3); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_b2 = load_unaligned_u32(b + b_stride * 2); + const __m128i v_b3 = load_unaligned_u32(b + b_stride * 3); + const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1), + _mm_unpacklo_epi32(v_a2, v_a3)); + const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1), + _mm_unpacklo_epi32(v_b2, v_b3)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123); + const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m256i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + 
a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1)); + const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1)); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + __m256i sum = _mm256_setzero_si256(); + __m256i zero = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + sse_w4x4_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + sse_w8x2_avx2(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_a1 = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_b1 = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m256i v_a = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01); + const __m256i v_b = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01); + const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero); + const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero); + const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero); + const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero); + const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl); + const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu); + const __m256i temp = + _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub), + _mm256_madd_epi16(v_bsub, v_bsub)); + sum = _mm256_add_epi32(sum, temp); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + sse_w32_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 64: + do { + sse_w32_avx2(&sum, a, b); + sse_w32_avx2(&sum, a + 32, b + 32); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + default: + if ((width & 0x07) == 0) { + do { + int i = 0; + do { + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + const uint8_t *a2; + const uint8_t *b2; + sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } + sse = summary_all_avx2(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a, + const uint16_t *b) { + const __m256i v_a_w = _mm256_loadu_si256((const __m256i *)a); + const __m256i v_b_w = _mm256_loadu_si256((const __m256i *)b); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, 
_mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_a2 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 2)); + const __m128i v_a3 = _mm_loadl_epi64((const __m128i *)(a + a_stride * 3)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_b2 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 2)); + const __m128i v_b3 = _mm_loadl_epi64((const __m128i *)(b + b_stride * 3)); + const __m128i v_a_hi = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_a_lo = _mm_unpacklo_epi64(v_a2, v_a3); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_b_lo = _mm_unpacklo_epi64(v_b2, v_b3); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a_hi = _mm_loadu_si128((const __m128i *)(a + a_stride)); + const __m128i v_a_lo = _mm_loadu_si128((const __m128i *)a); + const __m256i v_a_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_a_lo), v_a_hi, 1); + const __m128i v_b_hi = _mm_loadu_si128((const __m128i *)(b + b_stride)); + const __m128i v_b_lo = _mm_loadu_si128((const __m128i *)b); + const __m256i v_b_w = + _mm256_insertf128_si256(_mm256_castsi128_si256(v_b_lo), v_b_hi, 1); + const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w); + *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8, + int b_stride, int width, int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m256i sum = _mm256_setzero_si256(); + switch (width) { + case 4: + do { + highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 8: + do { + highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 16: + do { + highbd_sse_w16_avx2(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_avx2(&sum); + break; + case 32: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16, b + 16); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 64; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + case 64: + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + highbd_sse_w16_avx2(&sum32, a, b); + highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1); + highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2); + highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < 
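[Editor's aside, not part of the diff] The l-bounded inner loops in the high-bitdepth cases batch rows because, with up-to-16-bit samples, the 32-bit madd accumulators can only absorb a bounded number of rows before overflowing; partial sums are drained into 64-bit lanes once per batch, and the row budget shrinks as the width grows (64 rows at width 32, 32 rows at width 64 in the code here). A scalar sketch of the width-32 case:

#include <stdint.h>

static int64_t highbd_sse_batched_sketch(const uint16_t *a, int a_stride,
                                         const uint16_t *b, int b_stride,
                                         int height) {
  int64_t sum64 = 0;
  int y = 0;
  while (y < height) {
    uint32_t sum32 = 0; /* narrow accumulator, like the SIMD lanes */
    int l = 0;
    do {
      int x;
      for (x = 0; x < 32; x++) {
        const int d = a[x] - b[x];
        sum32 += (uint32_t)(d * d);
      }
      a += a_stride;
      b += b_stride;
      l++;
    } while (l < 64 && l < height - y); /* drain before overflow is possible */
    sum64 += sum32;
    y += l;
  }
  return sum64;
}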
(height - y)); + summary_32_avx2(&sum32, &sum); + y += 32; + } while (y < height); + sse = summary_4x64_avx2(sum); + break; + default: + if (width & 0x7) { + do { + int i = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + const uint16_t *a2; + const uint16_t *b2; + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + a2 = a + i + (a_stride << 1); + b2 = b + i + (b_stride << 1); + highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride); + summary_32_avx2(&sum32, &sum); + a += a_stride << 2; + b += b_stride << 2; + y += 4; + } while (y < height); + } else { + do { + int l = 0; + __m256i sum32 = _mm256_setzero_si256(); + do { + int i = 0; + do { + highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride); + i += 8; + } while (i < width); + a += a_stride << 1; + b += b_stride << 1; + l += 2; + } while (l < 8 && l < (height - y)); + summary_32_avx2(&sum32, &sum); + y += 8; + } while (y < height); + } + sse = summary_4x64_avx2(sum); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/sse_sse4.c b/vpx_dsp/x86/sse_sse4.c new file mode 100644 index 000000000..4a7585c57 --- /dev/null +++ b/vpx_dsp/x86/sse_sse4.c @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <smmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/x86/mem_sse2.h" + +static INLINE int64_t summary_all_sse4(const __m128i *sum_all) { + int64_t sum; + const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); + const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); + const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); + _mm_storel_epi64((__m128i *)&sum, sum_1x64); + return sum; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { + const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); + const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); + *sum64 = _mm_add_epi64(sum0, *sum64); + *sum64 = _mm_add_epi64(sum1, *sum64); +} +#endif + +static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, + const uint8_t *b) { + const __m128i v_a0 = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b0 = _mm_loadu_si128((const __m128i *)b); + const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); + const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); + const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); + const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); +} + +static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, __m128i *sum) { + const __m128i v_a0 = load_unaligned_u32(a); + const __m128i v_a1 = load_unaligned_u32(a + 
a_stride); + const __m128i v_b0 = load_unaligned_u32(b); + const __m128i v_b1 = load_unaligned_u32(b + b_stride); + const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); + const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b, + __m128i *sum) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); + const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +int64_t vpx_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, + int b_stride, int width, int height) { + int y = 0; + int64_t sse = 0; + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + sse8_sse4_1(a, b, &sum); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + sse_w16_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 32: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16, b + 16); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 64: + do { + sse_w16_sse4_1(&sum, a, b); + sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); + sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); + sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + default: + if (width & 0x07) { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); + i += 8; + } while (i + 4 < width); + sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + } while (y < height); + } else { + do { + int i = 0; + do { + sse8_sse4_1(a + i, b + i, &sum); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + } + sse = summary_all_sse4(&sum); + break; + } + + return sse; +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, + int a_stride, const uint16_t *b, + int b_stride) { + const __m128i v_a0 = _mm_loadl_epi64((const __m128i *)a); + const __m128i v_a1 = _mm_loadl_epi64((const __m128i *)(a + a_stride)); + const __m128i v_b0 = _mm_loadl_epi64((const __m128i *)b); + const __m128i v_b1 = _mm_loadl_epi64((const __m128i *)(b + b_stride)); + const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); + const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); +} + +static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, + const uint16_t *b) { + const __m128i v_a_w = _mm_loadu_si128((const __m128i *)a); + const __m128i v_b_w = _mm_loadu_si128((const __m128i *)b); + const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); + *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, 
v_d_w)); +} + +int64_t vpx_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, + const uint8_t *b8, int b_stride, int width, + int height) { + int32_t y = 0; + int64_t sse = 0; + uint16_t *a = CONVERT_TO_SHORTPTR(a8); + uint16_t *b = CONVERT_TO_SHORTPTR(b8); + __m128i sum = _mm_setzero_si128(); + switch (width) { + case 4: + do { + highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); + a += a_stride << 1; + b += b_stride << 1; + y += 2; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 8: + do { + highbd_sse_w8_sse4_1(&sum, a, b); + a += a_stride; + b += b_stride; + y += 1; + } while (y < height); + sse = summary_all_sse4(&sum); + break; + case 16: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 64 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 64; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 32: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 32 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 32; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + case 64: + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + highbd_sse_w8_sse4_1(&sum32, a, b); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); + highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 16 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 16; + } while (y < height); + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + default: + if (width & 0x7) { + do { + __m128i sum32 = _mm_setzero_si128(); + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); + i += 8; + } while (i + 4 < width); + highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); + a += (a_stride << 1); + b += (b_stride << 1); + y += 2; + summary_32_sse4(&sum32, &sum); + } while (y < height); + } else { + do { + int l = 0; + __m128i sum32 = _mm_setzero_si128(); + do { + int i = 0; + do { + highbd_sse_w8_sse4_1(&sum32, a + i, b + i); + i += 8; + } while (i < width); + a += a_stride; + b += b_stride; + l += 1; + } while (l < 8 && l < (height - y)); + summary_32_sse4(&sum32, &sum); + y += 8; + } while (y < height); + } + _mm_storel_epi64((__m128i *)&sse, + _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); + break; + } + return sse; +} +#endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm index 4273efb85..e3055ab29 100644 --- a/vpx_dsp/x86/subtract_sse2.asm +++ b/vpx_dsp/x86/subtract_sse2.asm @@ -124,4 +124,5 @@ INIT_MMX lea predq, [predq+pred_str*2] sub rowsd, 2 jg .loop_4 + emms RET diff --git 
a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c index 35925d590..8305b9f20 100644 --- a/vpx_dsp/x86/variance_avx2.c +++ b/vpx_dsp/x86/variance_avx2.c @@ -98,6 +98,41 @@ static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) { return _mm256_add_epi32(sum_lo, sum_hi); } +static INLINE void variance8_kernel_avx2( + const uint8_t *const src, const int src_stride, const uint8_t *const ref, + const int ref_stride, __m256i *const sse, __m256i *const sum) { + __m128i src0, src1, ref0, ref1; + __m256i ss, rr, diff; + + // 0 0 0.... 0 s07 s06 s05 s04 s03 s02 s01 s00 + src0 = _mm_loadl_epi64((const __m128i *)(src + 0 * src_stride)); + + // 0 0 0.... 0 s17 s16 s15 s14 s13 s12 s11 s10 + src1 = _mm_loadl_epi64((const __m128i *)(src + 1 * src_stride)); + + // s17 s16...s11 s10 s07 s06...s01 s00 (8bit) + src0 = _mm_unpacklo_epi64(src0, src1); + + // s17 s16...s11 s10 s07 s06...s01 s00 (16 bit) + ss = _mm256_cvtepu8_epi16(src0); + + // 0 0 0.... 0 r07 r06 r05 r04 r03 r02 r01 r00 + ref0 = _mm_loadl_epi64((const __m128i *)(ref + 0 * ref_stride)); + + // 0 0 0.... 0 r17 r16 r15 r14 r13 r12 r11 r10 + ref1 = _mm_loadl_epi64((const __m128i *)(ref + 1 * ref_stride)); + + // r17 r16...r11 r10 r07 r06...r01 r00 (8 bit) + ref0 = _mm_unpacklo_epi64(ref0, ref1); + + // r17 r16...r11 r10 r07 r06...r01 r00 (16 bit) + rr = _mm256_cvtepu8_epi16(ref0); + + diff = _mm256_sub_epi16(ss, rr); + *sse = _mm256_add_epi32(*sse, _mm256_madd_epi16(diff, diff)); + *sum = _mm256_add_epi16(*sum, diff); +} + static INLINE void variance16_kernel_avx2( const uint8_t *const src, const int src_stride, const uint8_t *const ref, const int ref_stride, __m256i *const sse, __m256i *const sum) { @@ -119,6 +154,21 @@ static INLINE void variance32_kernel_avx2(const uint8_t *const src, variance_kernel_avx2(s, r, sse, sum); } +static INLINE void variance8_avx2(const uint8_t *src, const int src_stride, + const uint8_t *ref, const int ref_stride, + const int h, __m256i *const vsse, + __m256i *const vsum) { + int i; + *vsum = _mm256_setzero_si256(); + *vsse = _mm256_setzero_si256(); + + for (i = 0; i < h; i += 2) { + variance8_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum); + src += 2 * src_stride; + ref += 2 * ref_stride; + } +} + static INLINE void variance16_avx2(const uint8_t *src, const int src_stride, const uint8_t *ref, const int ref_stride, const int h, __m256i *const vsse, @@ -612,6 +662,36 @@ typedef void (*get_var_avx2)(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +unsigned int vpx_variance8x4_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 4, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 5); +} + +unsigned int vpx_variance8x8_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 8, &vsse, &vsum); + variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 6); +} + +unsigned int vpx_variance8x16_avx2(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse) { + __m256i vsse, vsum; + int sum; + variance8_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 16, &vsse, &vsum); + 
variance_final_from_16bit_sum_avx2(vsse, vsum, sse, &sum); + return *sse - ((sum * sum) >> 7); +} + unsigned int vpx_variance16x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse) { diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index c7d880860..526c28382 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -15,6 +15,7 @@ #include "vpx_dsp/x86/convolve.h" #include "vpx_dsp/x86/convolve_avx2.h" #include "vpx_dsp/x86/convolve_sse2.h" +#include "vpx_dsp/x86/convolve_ssse3.h" #include "vpx_ports/mem.h" // filters for 16_h8 @@ -38,6 +39,27 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = { 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14 }; +DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, + 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, + 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10, +}; + +#define CALC_CONVOLVE8_HORZ_ROW \ + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \ + s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ + s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ + s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ + s1[3] = _mm256_shuffle_epi8(srcReg, filt[3]); \ + s1[0] = convolve8_16_avx2(s1, f1); \ + s1[0] = _mm256_packus_epi16(s1[0], s1[0]); \ + src_ptr += src_stride; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(s1[0])); \ + output_ptr += output_pitch; \ + _mm_storel_epi64((__m128i *)&output_ptr[0], \ + _mm256_extractf128_si256(s1[0], 1)); \ + output_ptr += output_pitch; + static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, @@ -61,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( __m256i srcReg; // load the 2 strides of source - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -77,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); + srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -97,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( src_ptr += src_stride; - // average if necessary - outReg1 = _mm256_castsi256_si128(outReg32b1); - outReg2 = _mm256_extractf128_si256(outReg32b1, 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch)); + outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg); } - - // save 16 bytes - 
_mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); - + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + output_pitch), &outReg32b1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { - __m128i srcReg; - - // load the first 16 bytes of the last row - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + const __m256i srcReg = + _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1); // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg1 = convolve8_8_avx2(s, f); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); - // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg2 = convolve8_8_avx2(s, f); + // The low and high 128-bits of each lane contain the first and second + // convolve result respectively + outReg32b1 = convolve8_16_avx2(s, f); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively + // shrink to 8 bit each 16 bits outReg1 = _mm_packus_epi16(outReg1, outReg2); // average if necessary @@ -177,11 +166,63 @@ static void vpx_filter_block1d16_h8_avg_avx2( output_height, filter, 1); } +static void vpx_filter_block1d8_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m256i filt[4], f1[4], s1[4], srcReg; + __m128i f[4], s[4]; + int y = output_height; + + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + shuffle_filter_avx2(filter, f1); + filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); + filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); + filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); + filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); + + // Process next 4 rows + while (y > 3) { + CALC_CONVOLVE8_HORZ_ROW + CALC_CONVOLVE8_HORZ_ROW + y -= 4; + } + + // If remaining, then process 2 rows at a time + while (y > 1) { + CALC_CONVOLVE8_HORZ_ROW + y -= 2; + } + + // For the remaining height. 
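// Editor's note: a minimal scalar sketch of what each output pixel of this
// 8-tap horizontal path computes, assuming the usual vpx_dsp helpers
// (FILTER_BITS == 7, ROUND_POWER_OF_TWO, clip_pixel):
//   int k, sum = 0;
//   for (k = 0; k < 8; ++k) sum += src_ptr[k - 3] * filter[k];
//   *output_ptr = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));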
+ if (y > 0) { + const __m128i src_reg_128 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + f[0] = _mm256_castsi256_si128(f1[0]); + f[1] = _mm256_castsi256_si128(f1[1]); + f[2] = _mm256_castsi256_si128(f1[2]); + f[3] = _mm256_castsi256_si128(f1[3]); + + // filter the source buffer + s[0] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[0])); + s[1] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[1])); + s[2] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[2])); + s[3] = _mm_shuffle_epi8(src_reg_128, _mm256_castsi256_si128(filt[3])); + s[0] = convolve8_8_ssse3(s, f); + + // Saturate 16bit value to 8bit. + s[0] = _mm_packus_epi16(s[0], s[0]); + + // Save only 8 bytes + _mm_storel_epi64((__m128i *)&output_ptr[0], s[0]); + } +} + static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, const int avg) { - __m128i outReg1, outReg2; __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; @@ -260,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( src_ptr += src_stride; // average if necessary - outReg1 = _mm256_castsi256_si128(s1[0]); - outReg2 = _mm256_extractf128_si256(s1[0], 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch)); + s1[0] = _mm256_avg_epu8(s1[0], outReg); } - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + out_pitch), s1); output_ptr += dst_stride; @@ -534,9 +570,6 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_dst_stride = dst_stride << 1; int h; - __m256i src_reg, src_reg_shift_0, src_reg_shift_2; - __m256i dst_reg; - __m256i tmp_0, tmp_1; __m256i idx_shift_0 = _mm256_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8); @@ -557,9 +590,11 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, for (h = height; h >= 2; h -= 2) { // Load the source - src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); - src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); - src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + const __m256i src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + __m256i dst_reg; + __m256i tmp_0, tmp_1; + const __m256i src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + const __m256i src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); // Get the output tmp_0 = _mm256_maddubs_epi16(src_reg_shift_0, kernel_reg_23); @@ -580,9 +615,9 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, // Repeat for the last row if needed if (h > 0) { - __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + const __m128i src_reg = _mm_loadu_si128((const __m128i *)src_ptr); __m128i dst_reg; - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i tmp_0, tmp_1; __m128i src_reg_shift_0 = @@ -596,7 +631,7 @@ static void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, _mm256_castsi256_si128(kernel_reg_45)); dst_reg 
= _mm_adds_epi16(tmp_0, tmp_1); - dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32, 6); + dst_reg = mm_round_epi16_sse2(&dst_reg, &reg_32_128, 6); dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); @@ -715,8 +750,6 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, const ptrdiff_t unrolled_src_stride = src_stride << 1; const ptrdiff_t unrolled_dst_stride = dst_stride << 1; - __m256i src_reg, src_reg_shuf; - __m256i dst; __m256i shuf_idx = _mm256_setr_epi8(0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6); @@ -733,12 +766,12 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, for (h = height; h > 1; h -= 2) { // Load the source - src_reg = mm256_loadu2_epi64((const __m128i *)src_ptr, - (const __m128i *)(src_ptr + src_stride)); - src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); + const __m256i src_reg = mm256_loadu2_epi64( + (const __m128i *)src_ptr, (const __m128i *)(src_ptr + src_stride)); + const __m256i src_reg_shuf = _mm256_shuffle_epi8(src_reg, shuf_idx); // Get the result - dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); + __m256i dst = _mm256_maddubs_epi16(src_reg_shuf, kernel_reg); dst = _mm256_hadds_epi16(dst, _mm256_setzero_si256()); // Round result @@ -757,7 +790,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, if (h > 0) { // Load the source - const __m128i reg_32 = _mm_set1_epi16(32); // Used for rounding + const __m128i reg_32_128 = _mm_set1_epi16(32); // Used for rounding __m128i src_reg = _mm_loadl_epi64((const __m128i *)src_ptr); __m128i src_reg_shuf = _mm_shuffle_epi8(src_reg, _mm256_castsi256_si128(shuf_idx)); @@ -768,7 +801,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); // Round result - dst = mm_round_epi16_sse2(&dst, &reg_32, 6); + dst = mm_round_epi16_sse2(&dst, &reg_32_128, 6); // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); @@ -866,22 +899,399 @@ static void vpx_filter_block1d4_v4_avx2(const uint8_t *src_ptr, } } +static void vpx_filter_block1d8_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[8]; + __m128i s[9]; + + unsigned int y = output_height; + // Multiply the size of the source stride by two + const ptrdiff_t src_stride = src_pitch << 1; + + // The output_height is always a multiple of two. 
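// Editor's note: rows n and n+1 of the 8-wide block travel in the low and
// high 128-bit lanes of each __m256i, so every convolve8_16_avx2() call in
// the loop below produces two output rows; hence the even-height assert.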
+ assert(!(output_height & 1)); + + shuffle_filter_avx2(filter, f); + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + // merge the result together + // r[0]: 0 0 0 0 0 0 0 0 r17 r16 r15 r14 r13 r12 r11 r10 | 0 0 0 0 0 0 0 0 + // r07 r06 r05 r04 r03 r02 r01 r00 + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[1], 1); + + // r[1]: 0 0 0 0 0 0 0 0 r27 r26 r25 r24 r23 r22 r21 r20 | 0 0 0 0 0 0 0 0 + // r17 r16 r15 r14 r13 r12 r11 r10 + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[2], 1); + + // r[2]: 0 0 0 0 0 0 0 0 r37 r36 r35 r34 r33 r32 r31 r30 | 0 0 0 0 0 0 0 0 + // r27 r26 r25 r24 r23 r22 r21 r20 + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[3], 1); + + // r[3]: 0 0 0 0 0 0 0 0 r47 r46 r45 r44 r43 r42 r41 r40 | 0 0 0 0 0 0 0 0 + // r37 r36 r35 r34 r33 r32 r31 r30 + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[4], 1); + + // r[4]: 0 0 0 0 0 0 0 0 r57 r56 r55 r54 r53 r52 r51 r50 | 0 0 0 0 0 0 0 0 + // r47 r46 r45 r44 r43 r42 r41 r40 + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[5], 1); + + // r[5]: 0 0 0 0 0 0 0 0 r67 r66 r65 r64 r63 r62 r61 r60 | 0 0 0 0 0 0 0 0 + // r57 r56 r55 r54 r53 r52 r51 r50 + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[6], 1); + + // Merge together + // ss[0]: |r27 r17|.......|r21 r11|r20 r10 || r17 r07|.....|r12 r02|r11 + // r01|r10 r00| + ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + + // ss[1]: |r47 r37|.......|r41 r31|r40 r30 || r37 r27|.....|r32 r22|r31 + // r21|r30 r20| + ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); + + // ss[2]: |r67 r57|.......|r61 r51|r60 r50 || r57 r47|.....|r52 r42|r51 + // r41|r50 r40| + ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); + + // Process 2 rows at a time + do { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + // r[6]: 0 0 0 0 0 0 0 0 r77 r76 r75 r74 r73 r72 r71 r70 | 0 0 0 0 0 0 0 + // 0 r67 r66 r65 r64 r63 r62 r61 r60 + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[7], 1); + // r[7]: 0 0 0 0 0 0 0 0 r87 r86 r85 r84 r83 r82 r81 r80 | 0 0 0 0 0 0 0 + // 0 r77 r76 r75 r74 r73 r72 r71 r70 + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[8], 1); + + // ss[3] : | r87 r77 | .......| r81 r71 | r80 r70 || r77 r67 | .....| r72 + // r62 | r71 r61|r70 r60| + ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + ss[0] = convolve8_16_avx2(ss, f); + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + /* shift down two rows */ + s[6] = s[8]; + _mm_storel_epi64((__m128i *)&output_ptr[0], _mm256_castsi256_si128(ss[0])); + output_ptr += out_pitch; + _mm_storel_epi64((__m128i *)&output_ptr[0], + _mm256_extractf128_si256(ss[0], 1)); + output_ptr += out_pitch; + ss[0] = ss[1]; + ss[1] = ss[2]; + ss[2] = ss[3]; + y -= 2; + } while (y > 1); +} + +static void vpx_filter_block1d4_h8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { + __m128i filtersReg; + __m256i addFilterReg64_256bit; 
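// Editor's note: despite its name, this register is loaded with 32 per
// 16-bit lane (see below); the _mm256_hadds_epi16() pairwise add later
// doubles that into the usual +64 rounding term ahead of the >> 7 shift.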
+ unsigned int y = output_height; + + assert(output_height > 1); + + addFilterReg64_256bit = _mm256_set1_epi16(32); + + // f7 f6 f5 f4 f3 f2 f1 f0 (16 bit) + filtersReg = _mm_loadu_si128((const __m128i *)filter); + + // converting the 16 bit (short) to 8 bit (byte) and have the same data + // in both lanes of 128 bit register. + // f7 f6 f5 f4 f3 f2 f1 f0 || f7 f6 f5 f4 f3 f2 f1 f0 (8 bit each) + filtersReg = _mm_packs_epi16(filtersReg, filtersReg); + + { + ptrdiff_t src_stride; + __m256i filt1Reg, filt2Reg, firstFilters, secondFilters; + // have the same data in both lanes of a 256 bit register + // f7 f6 f5 f4 f3 f2 f1 f0 f7 f6 f5 f4 f3 f2 f1 f0 | f7 f6 f5 f4 f3 f2 f1 f0 + // f7 f6 f5 f4 f3 f2 f1 f0 (8bit each) + const __m256i filtersReg32 = _mm256_broadcastsi128_si256(filtersReg); + + // duplicate only the first 32 bits + // f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0|f3 f2 f1 f0 | f3 f2 f1 f0|f3 f2 f1 + // f0|f3 f2 f1 f0|f3 f2 f1 f0 + firstFilters = _mm256_shuffle_epi32(filtersReg32, 0); + // duplicate only the second 32 bits + // f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4|f7 f6 f5 f4 | f7 f6 f5 f4|f7 f6 f5 + // f4|f7 f6 f5 f4|f7 f6 f5 f4 + secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55); + + // s6 s5 s4 s3 s5 s4 s3 s2 s4 s3 s2 s1 s3 s2 s1 s0 | s6 s5 s4 s3 s5 s4 s3 + // s2 s4 s3 s2 s1 s3 s2 s1 s0 + filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2); + + // s10 s9 s8 s7 s9 s8 s7 s6 s8 s7 s6 s5 s7 s6 s5 s4 | s10 s9 s8 s7 s9 s8 s7 + // s6 s8 s7 s6 s5 s7 s6 s5 s4 + filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32)); + + // multiple the size of the source and destination stride by two + src_stride = src_pitch << 1; + + do { + __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcReg32b1; + // load the 2 strides of source + // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 + // r06 r05 r04 r03 r02 r01 r00 + srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); + + // filter the source buffer + // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 + // r05 r04 r03 r05 r04 r03 r02 r04 r03 r02 r01 r03 r02 r01 r00 + srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg); + + // multiply 4 adjacent elements with the filter and add the result + // ...|f3*r14+f2*r13|f1*r13+f0*r12|f3*r13+f2*r12|f1*r11+f0*r10||... 
+ // |f1*r03+f0*r02|f3*r04+f2*r03|f1*r02+f0*r01|f3*r03+f2*r02|f1*r01+f0*r00 + srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); + + // filter the source buffer + // r110 r19 r18 r17|r19 r18 r17 r16|r18 r17 r16 r15|r17 r16 r15 r14||r010 + // r09 r08 r07|r09 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + + // multiply 4 adjacent elements with the filter and add the result + // r010 r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04||r010 + // r09 r08 r07|r9 r08 r07 r06|r08 r07 r06 r05|r07 r06 r05 r04 + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + + srcRegFilt32b1_1 = + _mm256_add_epi16(srcRegFilt32b1_1, addFilterReg64_256bit); + srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); + + srcRegFilt32b1_1 = + _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + // 0 0 0 0 R13 R12 R11 R10 || 0 0 0 0 R03 R02 R01 R00 (16bit) + srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7); + + // 8zeros 0 0 0 0 R13 R12 R11 R10 || 8zeros 0 0 0 0 R03 R02 R01 R00 (8bit) + srcRegFilt32b1_1 = + _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256()); + + src_ptr += src_stride; + // save first row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_castsi256_si128(srcRegFilt32b1_1)); + output_ptr += output_pitch; + + // save second row 4 values + *((int *)&output_ptr[0]) = + _mm_cvtsi128_si32(_mm256_extractf128_si256(srcRegFilt32b1_1, 1)); + output_ptr += output_pitch; + + y = y - 2; + } while (y > 1); + + // For remaining height + if (y > 0) { + __m128i srcReg1, srcRegFilt1_1, addFilterReg64; + __m128i srcRegFilt2; + + addFilterReg64 = _mm_set1_epi32((int)0x0400040u); + + srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + + // filter the source buffer + srcRegFilt1_1 = + _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, + _mm256_castsi256_si128(firstFilters)); + + // filter the source buffer + srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg)); + + // multiply 4 adjacent elements with the filter and add the result + srcRegFilt2 = + _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters)); + + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); + srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128()); + // shift by 6 bit each 16 bit + srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); + srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); + + // shrink to 8 bit each 16 bits, the first lane contain the first + // convolve result and the second lane contain the second convolve result + srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128()); + + // save 4 bytes + *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1); + } + } +} + +static void vpx_filter_block1d4_v8_avx2( + const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, + ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { + __m256i f[4], ss[4]; + __m256i r[9], rr[2]; + __m128i s[11]; + + unsigned int y = output_height; + // Multiply the size of the source stride by four + const ptrdiff_t src_stride = src_pitch << 2; + const ptrdiff_t out_stride = out_pitch << 2; + + // The output_height is always a multiple of two. 
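// Editor's note: in this 4-wide vertical path, 4-pixel rows are packed in
// pairs with _mm256_unpacklo_epi32()/_mm256_unpacklo_epi8() so that a single
// convolve8_16_avx2() call in the main loop covers four output rows.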
+ assert(!(output_height & 0x01)); + + shuffle_filter_avx2(filter, f); + + s[0] = _mm_loadl_epi64((const __m128i *)(src_ptr + 0 * src_pitch)); + s[1] = _mm_loadl_epi64((const __m128i *)(src_ptr + 1 * src_pitch)); + s[2] = _mm_loadl_epi64((const __m128i *)(src_ptr + 2 * src_pitch)); + s[3] = _mm_loadl_epi64((const __m128i *)(src_ptr + 3 * src_pitch)); + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); + s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); + + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1); + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1); + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1); + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1); + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[0], r[1]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[1], r[2]); + + // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 + // r00| + ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[2], r[3]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[3], r[4]); + + // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 + // r20| + ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]); + // Process 4 rows at a time + while (y >= 4) { + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); + s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); + + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1); + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1); + rr[0] = _mm256_unpacklo_epi32(r[4], r[5]); + rr[1] = _mm256_unpacklo_epi32(r[5], r[6]); + ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1); + r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1); + rr[0] = _mm256_unpacklo_epi32(r[6], r[7]); + rr[1] = _mm256_unpacklo_epi32(r[7], r[8]); + ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + ss[0] = convolve8_16_avx2(ss, f); + + // r3 r2 r3 r2 r1 r0 r1 r0 + ss[0] = _mm256_packus_epi16(ss[0], ss[0]); + src_ptr += src_stride; + + mm256_storeu2_epi32((__m128i *const)output_ptr, + (__m128i *const)(output_ptr + (2 * out_pitch)), ss); + + ss[0] = _mm256_srli_si256(ss[0], 4); + + mm256_storeu2_epi32((__m128i *const)(output_ptr + (1 * out_pitch)), + (__m128i *const)(output_ptr + (3 * out_pitch)), ss); + + output_ptr += out_stride; + + ss[0] = ss[2]; + ss[1] = ss[3]; + + s[6] = s[10]; + s[5] = s[9]; + + r[4] = r[8]; + y -= 4; + } + + // Process 2 rows + if (y == 2) { + __m128i ss1[4], f1[4], r1[4]; + + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); + s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); + s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); + + 
f1[0] = _mm256_castsi256_si128(f[0]); + f1[1] = _mm256_castsi256_si128(f[1]); + f1[2] = _mm256_castsi256_si128(f[2]); + f1[3] = _mm256_castsi256_si128(f[3]); + + r1[0] = _mm_unpacklo_epi32(s[4], s[5]); + r1[1] = _mm_unpacklo_epi32(s[5], s[6]); + + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 + r1[2] = _mm_unpacklo_epi32(s[6], s[7]); + + // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 + r1[3] = _mm_unpacklo_epi32(s[7], s[8]); + + // r23 r13....r20 r10|r13 r03....r10 r00 + ss1[0] = _mm256_castsi256_si128(ss[0]); + + // r43 r33....r40 r30|r33 r23....r30 r20 + ss1[1] = _mm256_castsi256_si128(ss[1]); + + // r63 r53....r60 r50|r53 r43....r50 r40 + ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]); + + // r83 r73....r80 r70|r73 r63....r70 r60 + ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]); + + ss1[0] = convolve8_8_ssse3(ss1, f1); + + // r1 r0 r1 r0 + ss1[0] = _mm_packus_epi16(ss1[0], ss1[0]); + + // Save first row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + output_ptr += out_pitch; + + ss1[0] = _mm_srli_si128(ss1[0], 4); + // Save second row 4 values + *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(ss1[0]); + } +} + #if HAVE_AVX2 && HAVE_SSSE3 -filter8_1dfunction vpx_filter_block1d4_v8_ssse3; #if VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_intrin_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_intrin_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_intrin_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_intrin_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_intrin_ssse3 -#else // VPX_ARCH_X86 +#else // VPX_ARCH_X86 filter8_1dfunction vpx_filter_block1d8_v8_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_ssse3; filter8_1dfunction vpx_filter_block1d4_h8_ssse3; -#define vpx_filter_block1d8_v8_avx2 vpx_filter_block1d8_v8_ssse3 -#define vpx_filter_block1d8_h8_avx2 vpx_filter_block1d8_h8_ssse3 -#define vpx_filter_block1d4_h8_avx2 vpx_filter_block1d4_h8_ssse3 #endif // VPX_ARCH_X86_64 filter8_1dfunction vpx_filter_block1d8_v8_avg_ssse3; filter8_1dfunction vpx_filter_block1d8_h8_avg_ssse3; @@ -897,7 +1307,6 @@ filter8_1dfunction vpx_filter_block1d8_v2_ssse3; filter8_1dfunction vpx_filter_block1d8_h2_ssse3; filter8_1dfunction vpx_filter_block1d4_v2_ssse3; filter8_1dfunction vpx_filter_block1d4_h2_ssse3; -#define vpx_filter_block1d4_v8_avx2 vpx_filter_block1d4_v8_ssse3 #define vpx_filter_block1d16_v2_avx2 vpx_filter_block1d16_v2_ssse3 #define vpx_filter_block1d16_h2_avx2 vpx_filter_block1d16_h2_ssse3 #define vpx_filter_block1d8_v2_avx2 vpx_filter_block1d8_v2_ssse3 diff --git a/vpx_ports/aarch32_cpudetect.c b/vpx_ports/aarch32_cpudetect.c new file mode 100644 index 000000000..639f4ff8e --- /dev/null +++ b/vpx_ports/aarch32_cpudetect.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +// Feature detection code for Armv7-A / AArch32. + +#include "./vpx_config.h" +#include "arm_cpudetect.h" + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. 
There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(_MSC_VER) // end !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + // MSVC has no inline __asm support for Arm, but it does let you __emit + // instructions via their assembled hex code. + // All of these instructions should be essentially nops. + __try { + // VORR q0,q0,q0 + __emit(0xF2200150); + flags |= HAS_NEON; + } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { + // Ignore exception. + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON || HAVE_NEON_ASM + uint64_t features = android_getCpuFeatures(); + if (features & ANDROID_CPU_ARM_FEATURE_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} + +#elif defined(__linux__) // end defined(AOM_USE_ANDROID_CPU_FEATURES) + +#include <sys/auxv.h> + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH32_HWCAP_NEON (1 << 12) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON || HAVE_NEON_ASM + if (hwcap & VPX_AARCH32_HWCAP_NEON) { + flags |= HAS_NEON; + } +#endif // HAVE_NEON || HAVE_NEON_ASM + return flags; +} +#else // end __linux__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (arm_cpu_env_flags(&flags)) { + return flags; + } + return arm_get_cpu_caps() & arm_cpu_env_mask(); +} diff --git a/vpx_ports/aarch64_cpudetect.c b/vpx_ports/aarch64_cpudetect.c new file mode 100644 index 000000000..539d09bb3 --- /dev/null +++ b/vpx_ports/aarch64_cpudetect.c @@ -0,0 +1,199 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./vpx_config.h" +#include "arm_cpudetect.h" + +#if defined(__APPLE__) +#include <sys/sysctl.h> +#endif + +#if !CONFIG_RUNTIME_CPU_DETECT + +static int arm_get_cpu_caps(void) { + // This function should actually be a no-op. There is no way to adjust any of + // these because the RTCD tables do not exist: the functions are called + // statically. 
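// Editor's note: whichever branch defines arm_get_cpu_caps(), the public
// arm_cpu_caps() at the bottom of this file still honors the VPX_SIMD_CAPS
// and VPX_SIMD_CAPS_MASK environment overrides; for example, running with
// VPX_SIMD_CAPS_MASK=0x1 keeps only HAS_NEON (bit 0) and masks off the
// dot-product, I8MM and SVE flags.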
+ int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON + return flags; +} + +#elif defined(__APPLE__) // end !CONFIG_RUNTIME_CPU_DETECT + +// sysctlbyname() parameter documentation for instruction set characteristics: +// https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics +static INLINE int64_t have_feature(const char *feature) { + int64_t feature_present = 0; + size_t size = sizeof(feature_present); + if (sysctlbyname(feature, &feature_present, &size, NULL, 0) != 0) { + return 0; + } + return feature_present; +} + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (have_feature("hw.optional.arm.FEAT_DotProd")) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (have_feature("hw.optional.arm.FEAT_I8MM")) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM + return flags; +} + +#elif defined(_WIN32) // end __APPLE__ + +static int arm_get_cpu_caps(void) { + int flags = 0; +// IsProcessorFeaturePresent() parameter documentation: +// https://learn.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-isprocessorfeaturepresent#parameters +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD +// Support for PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE was added in Windows SDK +// 20348, supported by Windows 11 and Windows Server 2022. +#if defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) + if (IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE)) { + flags |= HAS_NEON_DOTPROD; + } +#endif // defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) +#endif // HAVE_NEON_DOTPROD + // No I8MM or SVE feature detection available on Windows at time of writing. + return flags; +} + +#elif defined(ANDROID_USE_CPU_FEATURES_LIB) + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + return flags; +} + +#elif defined(__linux__) // end defined(VPX_USE_ANDROID_CPU_FEATURES) + +#include <sys/auxv.h> + +// Define hwcap values ourselves: building with an old auxv header where these +// hwcap values are not defined should not prevent features from being enabled. +#define VPX_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define VPX_AARCH64_HWCAP_SVE (1 << 22) +#define VPX_AARCH64_HWCAP2_I8MM (1 << 13) + +static int arm_get_cpu_caps(void) { + int flags = 0; + unsigned long hwcap = getauxval(AT_HWCAP); +#if HAVE_NEON_I8MM + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#endif // HAVE_NEON_I8MM +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON +#if HAVE_NEON_DOTPROD + if (hwcap & VPX_AARCH64_HWCAP_ASIMDDP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (hwcap2 & VPX_AARCH64_HWCAP2_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (hwcap & VPX_AARCH64_HWCAP_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE + return flags; +} + +#elif defined(__Fuchsia__) // end __linux__ + +#include <zircon/features.h> +#include <zircon/syscalls.h> + +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/894282. +#ifndef ZX_ARM64_FEATURE_ISA_I8MM +#define ZX_ARM64_FEATURE_ISA_I8MM ((uint32_t)(1u << 19)) +#endif +// Added in https://fuchsia-review.googlesource.com/c/fuchsia/+/895083. 
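// Editor's note: as with the Linux hwcap values above, the bit is defined
// locally (guarded by #ifndef, below) so that building against an older
// Fuchsia sysroot does not silently drop the feature; the
// zx_system_get_features() query still gates it at runtime.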
+#ifndef ZX_ARM64_FEATURE_ISA_SVE +#define ZX_ARM64_FEATURE_ISA_SVE ((uint32_t)(1u << 20)) +#endif + +static int arm_get_cpu_caps(void) { + int flags = 0; +#if HAVE_NEON + flags |= HAS_NEON; // Neon is mandatory in Armv8.0-A. +#endif // HAVE_NEON + uint32_t features; + zx_status_t status = zx_system_get_features(ZX_FEATURE_KIND_CPU, &features); + if (status != ZX_OK) { + return flags; + } +#if HAVE_NEON_DOTPROD + if (features & ZX_ARM64_FEATURE_ISA_DP) { + flags |= HAS_NEON_DOTPROD; + } +#endif // HAVE_NEON_DOTPROD +#if HAVE_NEON_I8MM + if (features & ZX_ARM64_FEATURE_ISA_I8MM) { + flags |= HAS_NEON_I8MM; + } +#endif // HAVE_NEON_I8MM +#if HAVE_SVE + if (features & ZX_ARM64_FEATURE_ISA_SVE) { + flags |= HAS_SVE; + } +#endif // HAVE_SVE + return flags; +} + +#else // end __Fuchsia__ +#error \ + "Runtime CPU detection selected, but no CPU detection method available" \ +"for your platform. Rerun configure with --disable-runtime-cpu-detect." +#endif + +int arm_cpu_caps(void) { + int flags = 0; + if (!arm_cpu_env_flags(&flags)) { + flags = arm_get_cpu_caps() & arm_cpu_env_mask(); + } + + // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_NEON_I8MM; + } + + // Restrict flags: FEAT_SVE assumes that FEAT_{DotProd,I8MM} are available. + if (!(flags & HAS_NEON_DOTPROD)) { + flags &= ~HAS_SVE; + } + if (!(flags & HAS_NEON_I8MM)) { + flags &= ~HAS_SVE; + } + + return flags; +} diff --git a/vpx_ports/arm.h b/vpx_ports/arm.h index 6458a2c5b..39365d18e 100644 --- a/vpx_ports/arm.h +++ b/vpx_ports/arm.h @@ -17,12 +17,14 @@ extern "C" { #endif -/*ARMv5TE "Enhanced DSP" instructions.*/ -#define HAS_EDSP 0x01 -/*ARMv6 "Parallel" or "Media" instructions.*/ -#define HAS_MEDIA 0x02 -/*ARMv7 optional NEON instructions.*/ -#define HAS_NEON 0x04 +// Armv7-A optional Neon instructions, mandatory from Armv8.0-A. +#define HAS_NEON (1 << 0) +// Armv8.2-A optional Neon dot-product instructions, mandatory from Armv8.4-A. +#define HAS_NEON_DOTPROD (1 << 1) +// Armv8.2-A optional Neon i8mm instructions, mandatory from Armv8.6-A. +#define HAS_NEON_I8MM (1 << 2) +// Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. +#define HAS_SVE (1 << 3) int arm_cpu_caps(void); diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c deleted file mode 100644 index 4f9d480ad..000000000 --- a/vpx_ports/arm_cpudetect.c +++ /dev/null @@ -1,154 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <stdlib.h> -#include <string.h> - -#include "./vpx_config.h" -#include "vpx_ports/arm.h" - -#ifdef WINAPI_FAMILY -#include <winapifamily.h> -#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define getenv(x) NULL -#endif -#endif - -static int arm_cpu_env_flags(int *flags) { - char *env; - env = getenv("VPX_SIMD_CAPS"); - if (env && *env) { - *flags = (int)strtol(env, NULL, 0); - return 0; - } - *flags = 0; - return -1; -} - -static int arm_cpu_env_mask(void) { - char *env; - env = getenv("VPX_SIMD_CAPS_MASK"); - return env && *env ? (int)strtol(env, NULL, 0) : ~0; -} - -#if !CONFIG_RUNTIME_CPU_DETECT - -int arm_cpu_caps(void) { - /* This function should actually be a no-op. 
There is no way to adjust any of - * these because the RTCD tables do not exist: the functions are called - * statically */ - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -#if HAVE_NEON || HAVE_NEON_ASM - flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ -/*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#ifndef WIN32_EXTRA_LEAN -#define WIN32_EXTRA_LEAN -#endif -#include <windows.h> - -int arm_cpu_caps(void) { - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); -/* MSVC has no inline __asm support for ARM, but it does let you __emit - * instructions via their assembled hex code. - * All of these instructions should be essentially nops. - */ -#if HAVE_NEON || HAVE_NEON_ASM - if (mask & HAS_NEON) { - __try { - /*VORR q0,q0,q0*/ - __emit(0xF2200150); - flags |= HAS_NEON; - } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - /*Ignore exception.*/ - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__ANDROID__) /* end _MSC_VER */ -#include <cpu-features.h> - -int arm_cpu_caps(void) { - int flags; - int mask; - uint64_t features; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - features = android_getCpuFeatures(); - -#if HAVE_NEON || HAVE_NEON_ASM - if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - return flags & mask; -} - -#elif defined(__linux__) /* end __ANDROID__ */ - -#include <stdio.h> - -int arm_cpu_caps(void) { - FILE *fin; - int flags; - int mask; - if (!arm_cpu_env_flags(&flags)) { - return flags; - } - mask = arm_cpu_env_mask(); - /* Reading /proc/self/auxv would be easier, but that doesn't work reliably - * on Android. - * This also means that detection will fail in Scratchbox. - */ - fin = fopen("/proc/cpuinfo", "r"); - if (fin != NULL) { - /* 512 should be enough for anybody (it's even enough for all the flags - * that x86 has accumulated... so far). - */ - char buf[512]; - while (fgets(buf, 511, fin) != NULL) { -#if HAVE_NEON || HAVE_NEON_ASM - if (memcmp(buf, "Features", 8) == 0) { - char *p; - p = strstr(buf, " neon"); - if (p != NULL && (p[5] == ' ' || p[5] == '\n')) { - flags |= HAS_NEON; - } - } -#endif /* HAVE_NEON || HAVE_NEON_ASM */ - } - fclose(fin); - } - return flags & mask; -} -#else /* end __linux__ */ -#error \ - "--enable-runtime-cpu-detect selected, but no CPU detection method " \ -"available for your platform. Reconfigure with --disable-runtime-cpu-detect." -#endif diff --git a/vpx_ports/arm_cpudetect.h b/vpx_ports/arm_cpudetect.h new file mode 100644 index 000000000..881397abc --- /dev/null +++ b/vpx_ports/arm_cpudetect.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <stdlib.h> +#include <string.h> + +#include "vpx_config.h" +#include "vpx_ports/arm.h" + +#if defined(_WIN32) +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#undef WIN32_EXTRA_LEAN +#define WIN32_EXTRA_LEAN +#include <windows.h> +#endif + +#ifdef WINAPI_FAMILY +#include <winapifamily.h> +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define getenv(x) NULL +#endif +#endif + +#if defined(__ANDROID__) && (__ANDROID_API__ < 18) +#define ANDROID_USE_CPU_FEATURES_LIB 1 +// Use getauxval() when targeting (64-bit) Android with API level >= 18. +// getauxval() is supported since Android API level 18 (Android 4.3.) +// First Android version with 64-bit support was Android 5.x (API level 21). +#include <cpu-features.h> +#endif + +static INLINE int arm_cpu_env_flags(int *flags) { + const char *env = getenv("VPX_SIMD_CAPS"); + if (env && *env) { + *flags = (int)strtol(env, NULL, 0); + return 1; + } + return 0; +} + +static INLINE int arm_cpu_env_mask(void) { + const char *env = getenv("VPX_SIMD_CAPS_MASK"); + return env && *env ? (int)strtol(env, NULL, 0) : ~0; +} diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk index e30e87cef..93279dbeb 100644 --- a/vpx_ports/vpx_ports.mk +++ b/vpx_ports/vpx_ports.mk @@ -36,7 +36,12 @@ PORTS_SRCS-yes += x86.h PORTS_SRCS-yes += x86_abi_support.asm endif -PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.c +ifeq ($(VPX_ARCH_AARCH64),yes) +PORTS_SRCS-yes += aarch64_cpudetect.c +else +PORTS_SRCS-$(VPX_ARCH_ARM) += aarch32_cpudetect.c +endif +PORTS_SRCS-$(VPX_ARCH_ARM) += arm_cpudetect.h PORTS_SRCS-$(VPX_ARCH_ARM) += arm.h PORTS_SRCS-$(VPX_ARCH_PPC) += ppc_cpudetect.c @@ -446,7 +446,7 @@ static void generate_filename(const char *pattern, char *out, size_t q_len, case '7': snprintf(q, q_len - 1, "%07d", frame_in); break; case '8': snprintf(q, q_len - 1, "%08d", frame_in); break; case '9': snprintf(q, q_len - 1, "%09d", frame_in); break; - default: die("Unrecognized pattern %%%c\n", p[1]); break; + default: die("Unrecognized pattern %%%c\n", p[1]); } pat_len = strlen(q); @@ -996,7 +996,7 @@ static int main_loop(int argc, const char **argv_) { if (single_file) { if (use_y4m) { - char buf[Y4M_BUFFER_SIZE] = { 0 }; + char y4m_buf[Y4M_BUFFER_SIZE] = { 0 }; size_t len = 0; if (img->fmt == VPX_IMG_FMT_I440 || img->fmt == VPX_IMG_FMT_I44016) { fprintf(stderr, "Cannot produce y4m output for 440 sampling.\n"); @@ -1005,21 +1005,22 @@ static int main_loop(int argc, const char **argv_) { if (frame_out == 1) { // Y4M file header len = y4m_write_file_header( - buf, sizeof(buf), vpx_input_ctx.width, vpx_input_ctx.height, - &vpx_input_ctx.framerate, img->fmt, img->bit_depth); + y4m_buf, sizeof(y4m_buf), vpx_input_ctx.width, + vpx_input_ctx.height, &vpx_input_ctx.framerate, img->fmt, + img->bit_depth); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } // Y4M frame header - len = y4m_write_frame_header(buf, sizeof(buf)); + len = y4m_write_frame_header(y4m_buf, sizeof(y4m_buf)); if (do_md5) { - MD5Update(&md5_ctx, (md5byte *)buf, (unsigned int)len); + MD5Update(&md5_ctx, (md5byte *)y4m_buf, (unsigned int)len); } else { - fputs(buf, outfile); + fputs(y4m_buf, outfile); } } else { if (frame_out == 1) { @@ -883,7 +883,7 @@ static struct stream_state *new_stream(struct VpxEncoderConfig *global, /* Default lag_in_frames is 0 in realtime mode CBR mode*/ if (global->deadline == VPX_DL_REALTIME && - 
stream->config.cfg.rc_end_usage == 1) + stream->config.cfg.rc_end_usage == VPX_CBR) stream->config.cfg.g_lag_in_frames = 0; } @@ -1586,13 +1586,14 @@ static void test_decode(struct stream_state *stream, /* Get the internal reference frame */ if (strcmp(codec->name, "vp8") == 0) { struct vpx_ref_frame ref_enc, ref_dec; - int width, height; + int aligned_width = (stream->config.cfg.g_w + 15) & ~15; + int aligned_height = (stream->config.cfg.g_h + 15) & ~15; - width = (stream->config.cfg.g_w + 15) & ~15; - height = (stream->config.cfg.g_h + 15) & ~15; - vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_enc.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); enc_img = ref_enc.img; - vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, width, height, 1); + vpx_img_alloc(&ref_dec.img, VPX_IMG_FMT_I420, aligned_width, aligned_height, + 1); dec_img = ref_dec.img; ref_enc.frame_type = VP8_LAST_FRAME; @@ -1969,10 +1970,9 @@ int main(int argc, const char **argv_) { } else { const int64_t input_pos = ftello(input.file); const int64_t input_pos_lagged = input_pos - lagged_count; - const int64_t limit = input.length; rate = cx_time ? input_pos_lagged * (int64_t)1000000 / cx_time : 0; - remaining = limit - input_pos + lagged_count; + remaining = input.length - input_pos + lagged_count; } average_rate = diff --git a/y4minput.c b/y4minput.c index 745e2f1cd..210ce52fc 100644 --- a/y4minput.c +++ b/y4minput.c @@ -1148,6 +1148,7 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) { _img->fmt = _y4m->vpx_fmt; _img->w = _img->d_w = _y4m->pic_w; _img->h = _img->d_h = _y4m->pic_h; + _img->bit_depth = _y4m->bit_depth; _img->x_chroma_shift = _y4m->dst_c_dec_h >> 1; _img->y_chroma_shift = _y4m->dst_c_dec_v >> 1; _img->bps = _y4m->bps;
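Editor's note: for reference against the SSE4.1/AVX2/Neon kernels added earlier in this change, the sum-of-squared-error they all compute reduces to the following scalar form (a sketch modeled on the new vpx_dsp/sse.c; the name sse_ref is illustrative, not the verbatim source):

#include <stdint.h>

int64_t sse_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                int b_stride, int width, int height) {
  int64_t sse = 0;
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x) {
      const int32_t diff = a[x] - b[x];  // per-pixel difference
      sse += diff * diff;                // accumulate squared error
    }
    a += a_stride;  // advance to the next row of each buffer
    b += b_stride;
  }
  return sse;
}

The SIMD versions differ only in computing the difference, multiply and accumulate for 8 to 32 pixels at a time and, in the high-bitdepth variants, in periodically widening 32-bit partial sums to 64 bits (the summary_32_* helpers) to avoid overflow.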