author     James Zern <jzern@google.com>  2023-03-01 18:58:46 -0800
committer  James Zern <jzern@google.com>  2023-03-01 19:16:38 -0800
commit     1b4678c67ec028c753928b70fdc10aaa5c78bffa (patch)
tree       68b49c6ec3b33474b68c406d45472a54d6f694e8
parent     6939824c0cf8321a1718973892371085f7b4edff (diff)
download   libvpx-1b4678c67ec028c753928b70fdc10aaa5c78bffa.tar.gz
update to v1.13.0
Current HEAD: d6eb9696aa72473c1a11d34d928d35a3acc0c9a9

git log from upstream:
d6eb9696a Fix unsigned integer overflow in sse computation
b5a2b3a92 Update AUTHORS .mailmap and version
aa5b62236 Fix per frame qp for temporal layers
3f109f786 Update CHANGELOG
72cfcdd95 Skip calculating internal stats when frame dropped
67abc6738 Specialize Neon averaging subpel variance by filter value
b7f6c6413 Refactor Neon averaging subpel variance functions
ae5b60cb4 Specialize Neon subpel variance by filter value for large blocks
fcfb471ce Refactor Neon subpel variance functions
ae4240edc Add codec control to set per frame QP
0ce866562 Refactor Neon implementation of variance functions
5e8617953 */Android.mk: add a check for NDK_ROOT
71d01660c Fix to segfault for external resize test in vp9
59d4a6861 variance_test.cc: Enable HBDMse speed test.
32878bb1f variance_test.cc: Enable VpxHBDMseTest for C and SSE2.
5645938c3 Implement vertical convolutions using Neon USDOT instruction
f95206869 Implement horizontal convolutions using Neon USDOT instruction
e067469e7 build: replace egrep with grep -E
708c4aa85 Use Neon load/store helper functions consistently
ab1192c29 Use lane-referencing intrinsics in Neon convolution kernels
<...>

Bug: https://crbug.com/webm/1780
Test: presubmit, builds for aosp_arm-eng, aosp_arm64-eng, \
      aosp_x86-eng, aosp_x86_64-eng, aosp_barbet-userdebug, \
      aosp_oriole-userdebug \
      panther_hwasan-userdebug: \
      - vts-tradefed run commandAndExit vts \
        --include-filter VtsHalMediaC2V1_0TargetVideoDecTest \
        --module-arg "VtsHalMediaC2V1_0TargetVideoDecTest:include-filter:*vp8*" \
        --module-arg "VtsHalMediaC2V1_0TargetVideoDecTest:include-filter:*vp9*" \
        --include-filter VtsHalMediaC2V1_0TargetVideoEncTest \
        --module-arg "VtsHalMediaC2V1_0TargetVideoEncTest:include-filter:*vp8*" \
        --module-arg "VtsHalMediaC2V1_0TargetVideoEncTest:include-filter:*vp9*" \
      - cts-tradefed run commandAndExit cts-dev \
        --include-filter CtsMediaDecoderTestCases \
        --module-arg "CtsMediaDecoderTestCases:include-filter:.*(?i:vp[89])" \
        --include-filter CtsMediaV2TestCases \
        --module-arg "CtsMediaV2TestCases:include-filter:.*(?i:vp[89])" \
        --include-filter CtsMediaEncoderTestCases \
        --module-arg "CtsMediaEncoderTestCases:include-filter:.*(?i:vp[89])" \
        --include-filter CtsVideoTestCases \
        --module-arg "CtsVideoTestCases:include-filter:.*(?i:vp[89])"

Change-Id: Iac91cb8b90a745a836a22109e52658a809642414
Merged-In: Iac91cb8b90a745a836a22109e52658a809642414
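Note on the user-visible API change pulled in above ("ae4240edc Add codec control to set per frame QP"): a minimal sketch of how a caller might drive it, assuming the control added for v1.13.0 is VP9E_SET_QUANTIZER_ONE_PASS (verify the name against libvpx/vpx/vp8cx.h, which this update also touches) and that it takes a quantizer in the external VP9 range of 0-63. The helper name set_frame_qp is hypothetical.

  #include "vpx/vp8cx.h"
  #include "vpx/vpx_encoder.h"

  /* Hypothetical helper: pin the quantizer for the next encoded frame.
   * VP9E_SET_QUANTIZER_ONE_PASS and its 0-63 range are assumptions taken
   * from the v1.13.0 CHANGELOG, not from this diff. */
  static vpx_codec_err_t set_frame_qp(vpx_codec_ctx_t *codec, int qp) {
    if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM;
    return vpx_codec_control(codec, VP9E_SET_QUANTIZER_ONE_PASS, qp);
  }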
-rw-r--r--  Android.bp | 19
-rw-r--r--  README.android | 8
-rw-r--r--  README.version | 4
-rw-r--r--  config/arm-neon/vp9_rtcd.h | 12
-rw-r--r--  config/arm-neon/vpx_dsp_rtcd.h | 558
-rw-r--r--  config/arm-neon/vpx_version.h | 6
-rw-r--r--  config/arm64/vp9_rtcd.h | 12
-rw-r--r--  config/arm64/vpx_dsp_rtcd.h | 558
-rw-r--r--  config/arm64/vpx_version.h | 6
-rw-r--r--  config/generic/vpx_version.h | 6
-rw-r--r--  config/x86/vp9_rtcd.h | 6
-rw-r--r--  config/x86/vpx_dsp_rtcd.h | 3
-rw-r--r--  config/x86/vpx_version.h | 6
-rw-r--r--  config/x86_64/vpx_dsp_rtcd.h | 3
-rw-r--r--  config/x86_64/vpx_version.h | 6
-rw-r--r--  libvpx/.clang-format | 142
-rw-r--r--  libvpx/.mailmap | 3
-rw-r--r--  libvpx/AUTHORS | 4
-rw-r--r--  libvpx/CHANGELOG | 44
-rw-r--r--  libvpx/build/make/Android.mk | 3
-rw-r--r--  libvpx/build/make/Makefile | 14
-rwxr-xr-x  libvpx/build/make/configure.sh | 6
-rwxr-xr-x  libvpx/build/make/gen_asm_deps.sh | 2
-rwxr-xr-x  libvpx/build/make/rtcd.pl | 3
-rwxr-xr-x  libvpx/configure | 13
-rw-r--r--  libvpx/examples/svc_encodeframe.c | 7
-rw-r--r--  libvpx/examples/vp9_spatial_svc_encoder.c | 4
-rw-r--r--  libvpx/libs.doxy_template | 40
-rw-r--r--  libvpx/libs.mk | 16
-rw-r--r--  libvpx/md5_utils.c | 4
-rw-r--r--  libvpx/test/acm_random.h | 16
-rw-r--r--  libvpx/test/android/Android.mk | 4
-rw-r--r--  libvpx/test/comp_avg_pred_test.cc | 171
-rw-r--r--  libvpx/test/dct16x16_test.cc | 12
-rw-r--r--  libvpx/test/dct_partial_test.cc | 10
-rw-r--r--  libvpx/test/dct_test.cc | 23
-rw-r--r--  libvpx/test/encode_api_test.cc | 6
-rw-r--r--  libvpx/test/encode_test_driver.cc | 11
-rw-r--r--  libvpx/test/encode_test_driver.h | 10
-rw-r--r--  libvpx/test/error_resilience_test.cc | 2
-rw-r--r--  libvpx/test/frame_size_tests.cc | 6
-rw-r--r--  libvpx/test/hadamard_test.cc | 3
-rw-r--r--  libvpx/test/md5_helper.h | 2
-rw-r--r--  libvpx/test/pp_filter_test.cc | 13
-rw-r--r--  libvpx/test/resize_test.cc | 30
-rw-r--r--  libvpx/test/sad_test.cc | 230
-rw-r--r--  libvpx/test/svc_datarate_test.cc | 196
-rw-r--r--  libvpx/test/test.mk | 1
-rwxr-xr-x  libvpx/test/tools_common.sh | 4
-rw-r--r--  libvpx/test/variance_test.cc | 295
-rw-r--r--  libvpx/test/vp8_datarate_test.cc | 2
-rw-r--r--  libvpx/test/vp8_ratectrl_rtc_test.cc | 3
-rw-r--r--  libvpx/test/vp9_datarate_test.cc | 157
-rw-r--r--  libvpx/test/vp9_ext_ratectrl_test.cc | 798
-rw-r--r--  libvpx/test/vp9_quantize_test.cc | 309
-rw-r--r--  libvpx/test/vp9_ratectrl_rtc_test.cc | 269
-rw-r--r--  libvpx/test/vp9_subtract_test.cc | 161
-rw-r--r--  libvpx/third_party/googletest/README.libvpx | 10
-rw-r--r--  libvpx/third_party/googletest/src/.clang-format | 4
-rw-r--r--  libvpx/third_party/googletest/src/CONTRIBUTORS | 2
-rw-r--r--  libvpx/third_party/googletest/src/README.md | 8
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h | 237
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h | 89
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h | 76
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-message.h | 27
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h | 87
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-printers.h | 65
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-spi.h | 132
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h | 14
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h | 38
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest.h | 532
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h | 200
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/gtest_prod.h | 9
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md | 12
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h | 31
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h | 74
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h | 17
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h | 330
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h | 145
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h | 100
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h | 982
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h | 18
-rw-r--r--  libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h | 21
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-all.cc | 3
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-assertion-result.cc | 77
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-death-test.cc | 520
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-filepath.cc | 78
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-internal-inl.h | 217
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-matchers.cc | 5
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-port.cc | 349
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-printers.cc | 124
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-test-part.cc | 19
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest-typed-test.cc | 7
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest.cc | 1509
-rw-r--r--  libvpx/third_party/googletest/src/src/gtest_main.cc | 5
-rw-r--r--  libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py | 2
-rw-r--r--  libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py | 4
-rw-r--r--  libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py | 4
-rw-r--r--  libvpx/tools/3D-Reconstruction/MotionEST/Util.py | 2
-rw-r--r--  libvpx/vp8/common/findnearmv.c | 4
-rw-r--r--  libvpx/vp8/common/mips/dspr2/filter_dspr2.c | 12
-rw-r--r--  libvpx/vp8/common/mips/msa/vp8_macros_msa.h | 24
-rw-r--r--  libvpx/vp8/decoder/dboolhuff.h | 4
-rw-r--r--  libvpx/vp8/decoder/decodemv.c | 4
-rw-r--r--  libvpx/vp8/decoder/onyxd_int.h | 4
-rw-r--r--  libvpx/vp8/encoder/bitstream.c | 5
-rw-r--r--  libvpx/vp8/encoder/encodemv.c | 10
-rw-r--r--  libvpx/vp8/encoder/firstpass.c | 31
-rw-r--r--  libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c (renamed from libvpx/vp8/encoder/loongarch/quantize_lsx.c) | 0
-rw-r--r--  libvpx/vp8/encoder/mcomp.c | 29
-rw-r--r--  libvpx/vp8/encoder/onyx_if.c | 15
-rw-r--r--  libvpx/vp8/encoder/onyx_int.h | 8
-rw-r--r--  libvpx/vp8/encoder/rdopt.c | 2
-rw-r--r--  libvpx/vp8/encoder/x86/denoising_sse2.c | 2
-rw-r--r--  libvpx/vp8/encoder/x86/quantize_sse4.c | 5
-rw-r--r--  libvpx/vp8/vp8_dx_iface.c | 2
-rw-r--r--  libvpx/vp8/vp8_ratectrl_rtc.cc | 61
-rw-r--r--  libvpx/vp8/vp8cx.mk | 2
-rw-r--r--  libvpx/vp9/common/vp9_common.h | 2
-rw-r--r--  libvpx/vp9/common/vp9_loopfilter.c | 2
-rw-r--r--  libvpx/vp9/common/vp9_rtcd_defs.pl | 9
-rw-r--r--  libvpx/vp9/common/vp9_scan.c | 293
-rw-r--r--  libvpx/vp9/decoder/vp9_decodemv.c | 6
-rw-r--r--  libvpx/vp9/decoder/vp9_detokenize.c | 15
-rw-r--r--  libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c | 1265
-rw-r--r--  libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 322
-rw-r--r--  libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c | 5
-rw-r--r--  libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c | 495
-rw-r--r--  libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c | 4
-rw-r--r--  libvpx/vp9/encoder/vp9_bitstream.c | 6
-rw-r--r--  libvpx/vp9/encoder/vp9_denoiser.c | 7
-rw-r--r--  libvpx/vp9/encoder/vp9_encodeframe.c | 14
-rw-r--r--  libvpx/vp9/encoder/vp9_encoder.c | 155
-rw-r--r--  libvpx/vp9/encoder/vp9_encoder.h | 4
-rw-r--r--  libvpx/vp9/encoder/vp9_ext_ratectrl.c | 65
-rw-r--r--  libvpx/vp9/encoder/vp9_ext_ratectrl.h | 13
-rw-r--r--  libvpx/vp9/encoder/vp9_firstpass.c | 53
-rw-r--r--  libvpx/vp9/encoder/vp9_pickmode.c | 22
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.c | 28
-rw-r--r--  libvpx/vp9/encoder/vp9_quantize.h | 3
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.c | 55
-rw-r--r--  libvpx/vp9/encoder/vp9_ratectrl.h | 4
-rw-r--r--  libvpx/vp9/encoder/vp9_rd.c | 20
-rw-r--r--  libvpx/vp9/encoder/vp9_rdopt.c | 47
-rw-r--r--  libvpx/vp9/encoder/vp9_segmentation.c | 2
-rw-r--r--  libvpx/vp9/encoder/vp9_svc_layercontext.c | 10
-rw-r--r--  libvpx/vp9/encoder/vp9_svc_layercontext.h | 2
-rw-r--r--  libvpx/vp9/encoder/vp9_temporal_filter.c | 12
-rw-r--r--  libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c | 135
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c | 6
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c | 10
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c | 4
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_quantize_avx2.c | 456
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_quantize_sse2.c | 230
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c | 253
-rw-r--r--  libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm | 178
-rw-r--r--  libvpx/vp9/ratectrl_rtc.cc | 16
-rw-r--r--  libvpx/vp9/ratectrl_rtc.h | 10
-rw-r--r--  libvpx/vp9/simple_encode.cc | 10
-rw-r--r--  libvpx/vp9/vp9_cx_iface.c | 35
-rw-r--r--  libvpx/vp9/vp9_dx_iface.c | 3
-rw-r--r--  libvpx/vp9/vp9cx.mk | 6
-rw-r--r--  libvpx/vpx/vp8cx.h | 12
-rw-r--r--  libvpx/vpx/vpx_encoder.h | 15
-rw-r--r--  libvpx/vpx/vpx_ext_ratectrl.h | 164
-rw-r--r--  libvpx/vpx_dsp/arm/fdct16x16_neon.c | 81
-rw-r--r--  libvpx/vpx_dsp/arm/fdct16x16_neon.h | 587
-rw-r--r--  libvpx/vpx_dsp/arm/fdct32x32_neon.c | 1574
-rw-r--r--  libvpx/vpx_dsp/arm/fdct32x32_neon.h | 2919
-rw-r--r--  libvpx/vpx_dsp/arm/fdct4x4_neon.c (renamed from libvpx/vpx_dsp/arm/fdct_neon.c) | 43
-rw-r--r--  libvpx/vpx_dsp/arm/fdct4x4_neon.h | 105
-rw-r--r--  libvpx/vpx_dsp/arm/fdct8x8_neon.c | 143
-rw-r--r--  libvpx/vpx_dsp/arm/fdct8x8_neon.h | 381
-rw-r--r--  libvpx/vpx_dsp/arm/fdct_neon.h | 602
-rw-r--r--  libvpx/vpx_dsp/arm/fdct_partial_neon.c | 65
-rw-r--r--  libvpx/vpx_dsp/arm/fwd_txfm_neon.c | 68
-rw-r--r--  libvpx/vpx_dsp/arm/hadamard_neon.c | 42
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_quantize_neon.c | 307
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_sad_neon.c | 225
-rw-r--r--  libvpx/vpx_dsp/arm/highbd_variance_neon.c | 496
-rw-r--r--  libvpx/vpx_dsp/arm/mem_neon.h | 165
-rw-r--r--  libvpx/vpx_dsp/arm/quantize_neon.c | 287
-rw-r--r--  libvpx/vpx_dsp/arm/sad4d_neon.c | 25
-rw-r--r--  libvpx/vpx_dsp/arm/sad_neon.c | 298
-rw-r--r--  libvpx/vpx_dsp/arm/subpel_variance_neon.c | 608
-rw-r--r--  libvpx/vpx_dsp/arm/subtract_neon.c | 56
-rw-r--r--  libvpx/vpx_dsp/arm/transpose_neon.h | 79
-rw-r--r--  libvpx/vpx_dsp/arm/variance_neon.c | 548
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve8_neon.c | 1101
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_convolve8_neon.h | 206
-rw-r--r--  libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c | 10
-rw-r--r--  libvpx/vpx_dsp/avg.c | 3
-rw-r--r--  libvpx/vpx_dsp/bitwriter.h | 5
-rw-r--r--  libvpx/vpx_dsp/loongarch/quantize_lsx.c | 13
-rw-r--r--  libvpx/vpx_dsp/loopfilter.c | 12
-rw-r--r--  libvpx/vpx_dsp/mips/macros_msa.h | 46
-rw-r--r--  libvpx/vpx_dsp/ppc/quantize_vsx.c | 24
-rw-r--r--  libvpx/vpx_dsp/psnr.c | 67
-rw-r--r--  libvpx/vpx_dsp/variance.c | 6
-rw-r--r--  libvpx/vpx_dsp/vpx_dsp.mk | 14
-rw-r--r--  libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl | 347
-rw-r--r--  libvpx/vpx_dsp/x86/avg_intrin_avx2.c | 4
-rw-r--r--  libvpx/vpx_dsp/x86/avg_intrin_sse2.c | 4
-rw-r--r--  libvpx/vpx_dsp/x86/convolve_avx2.h | 5
-rw-r--r--  libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h | 2
-rw-r--r--  libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h | 2
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h | 2
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c | 8
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c | 258
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 13
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c | 401
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_sad_avx2.c | 468
-rw-r--r--  libvpx/vpx_dsp/x86/highbd_variance_sse2.c | 47
-rw-r--r--  libvpx/vpx_dsp/x86/inv_txfm_sse2.c | 4
-rw-r--r--  libvpx/vpx_dsp/x86/loopfilter_avx2.c | 4
-rw-r--r--  libvpx/vpx_dsp/x86/loopfilter_sse2.c | 14
-rw-r--r--  libvpx/vpx_dsp/x86/mem_sse2.h | 4
-rw-r--r--  libvpx/vpx_dsp/x86/post_proc_sse2.c | 2
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_avx.c | 12
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_avx2.c | 293
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_sse2.c | 5
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_sse2.h | 17
-rw-r--r--  libvpx/vpx_dsp/x86/quantize_ssse3.c | 11
-rw-r--r--  libvpx/vpx_dsp/x86/sad_avx2.c | 15
-rw-r--r--  libvpx/vpx_dsp/x86/subtract_avx2.c | 203
-rw-r--r--  libvpx/vpx_dsp/x86/sum_squares_sse2.c | 2
-rw-r--r--  libvpx/vpx_dsp/x86/variance_avx2.c | 17
-rw-r--r--  libvpx/vpx_dsp/x86/variance_sse2.c | 12
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 6
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 34
-rw-r--r--  libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 6
-rw-r--r--  libvpx/vpx_ports/compiler_attributes.h | 12
-rw-r--r--  libvpx/vpxenc.c | 5
-rw-r--r--  libvpx/webmdec.h | 2
-rw-r--r--  libvpx/y4menc.c | 41
-rw-r--r--  libvpx/y4minput.c | 15
236 files changed, 19607 insertions, 9055 deletions
diff --git a/Android.bp b/Android.bp
index 8708fa18b..c63961b5a 100644
--- a/Android.bp
+++ b/Android.bp
@@ -109,6 +109,7 @@ libvpx_arm_neon_c_srcs = [
"libvpx/vp9/decoder/vp9_dsubexp.c",
"libvpx/vp9/decoder/vp9_job_queue.c",
"libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
+ "libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c",
"libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c",
"libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
"libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c",
@@ -153,9 +154,9 @@ libvpx_arm_neon_c_srcs = [
"libvpx/vpx_dsp/arm/avg_pred_neon.c",
"libvpx/vpx_dsp/arm/fdct16x16_neon.c",
"libvpx/vpx_dsp/arm/fdct32x32_neon.c",
- "libvpx/vpx_dsp/arm/fdct_neon.c",
+ "libvpx/vpx_dsp/arm/fdct4x4_neon.c",
+ "libvpx/vpx_dsp/arm/fdct8x8_neon.c",
"libvpx/vpx_dsp/arm/fdct_partial_neon.c",
- "libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"libvpx/vpx_dsp/arm/hadamard_neon.c",
"libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c",
"libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c",
@@ -166,6 +167,9 @@ libvpx_arm_neon_c_srcs = [
"libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c",
"libvpx/vpx_dsp/arm/highbd_intrapred_neon.c",
"libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_quantize_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_sad_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_variance_neon.c",
"libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c",
"libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c",
"libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c",
@@ -349,6 +353,7 @@ libvpx_arm64_c_srcs = [
"libvpx/vp9/decoder/vp9_dsubexp.c",
"libvpx/vp9/decoder/vp9_job_queue.c",
"libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
+ "libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c",
"libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c",
"libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
"libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c",
@@ -393,9 +398,9 @@ libvpx_arm64_c_srcs = [
"libvpx/vpx_dsp/arm/avg_pred_neon.c",
"libvpx/vpx_dsp/arm/fdct16x16_neon.c",
"libvpx/vpx_dsp/arm/fdct32x32_neon.c",
- "libvpx/vpx_dsp/arm/fdct_neon.c",
+ "libvpx/vpx_dsp/arm/fdct4x4_neon.c",
+ "libvpx/vpx_dsp/arm/fdct8x8_neon.c",
"libvpx/vpx_dsp/arm/fdct_partial_neon.c",
- "libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
"libvpx/vpx_dsp/arm/hadamard_neon.c",
"libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c",
"libvpx/vpx_dsp/arm/highbd_idct32x32_1024_add_neon.c",
@@ -406,6 +411,9 @@ libvpx_arm64_c_srcs = [
"libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c",
"libvpx/vpx_dsp/arm/highbd_intrapred_neon.c",
"libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_quantize_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_sad_neon.c",
+ "libvpx/vpx_dsp/arm/highbd_variance_neon.c",
"libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c",
"libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c",
"libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c",
@@ -746,6 +754,7 @@ libvpx_x86_c_srcs = [
"libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c",
"libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
"libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
+ "libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c",
"libvpx/vp9/vp9_cx_iface.c",
"libvpx/vp9/vp9_dx_iface.c",
"libvpx/vp9/vp9_iface_common.c",
@@ -981,6 +990,7 @@ libvpx_x86_64_c_srcs = [
"libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c",
"libvpx/vp9/encoder/x86/vp9_highbd_block_error_intrin_sse2.c",
"libvpx/vp9/encoder/x86/vp9_quantize_sse2.c",
+ "libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c",
"libvpx/vp9/vp9_cx_iface.c",
"libvpx/vp9/vp9_dx_iface.c",
"libvpx/vp9/vp9_iface_common.c",
@@ -1062,7 +1072,6 @@ libvpx_x86_64_asm_srcs = [
"libvpx/vp8/encoder/x86/fwalsh_sse2.asm",
"libvpx/vp9/encoder/x86/vp9_dct_sse2.asm",
"libvpx/vp9/encoder/x86/vp9_error_sse2.asm",
- "libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm",
"libvpx/vpx_dsp/x86/add_noise_sse2.asm",
"libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm",
"libvpx/vpx_dsp/x86/deblock_sse2.asm",
diff --git a/README.android b/README.android
index 38780acec..4fb133063 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
Name: libvpx
URL: http://www.webmproject.org
-Version: v1.12.0
+Version: v1.13.0
License: BSD
License File: libvpx/LICENSE
-Date: Thursday June 30 2022
-Branch: origin/torrent
-Commit: 03265cd42b3783532de72f2ded5436652e6f5ce3
+Date: Wednesday March 01 2023
+Branch: ugly-duckling
+Commit: d6eb9696aa72473c1a11d34d928d35a3acc0c9a9
Description:
Contains the sources used to compile libvpx.
diff --git a/README.version b/README.version
index 7dfba96ef..9858b39c9 100644
--- a/README.version
+++ b/README.version
@@ -1,5 +1,5 @@
-URL: https://chromium.googlesource.com/webm/libvpx/+archive/v1.12.0.tar.gz
-Version: v1.12.0
+URL: https://chromium.googlesource.com/webm/libvpx/
+Version: v1.13.0
BugComponent: 42195
Owners: jzern, jianj
Local Modifications:
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
index 01065e667..b2b2fc2dc 100644
--- a/config/arm-neon/vp9_rtcd.h
+++ b/config/arm-neon/vp9_rtcd.h
@@ -38,7 +38,8 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_neon
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -62,7 +63,8 @@ void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
@@ -83,10 +85,12 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon
void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon
void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
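Aside on the pattern above, which repeats throughout these generated RTCD headers: each generic symbol is declared once per available implementation and then bound with a #define to the best implementation for the configured target, so dispatch is resolved at compile time and call sites never change. A sketch of what a caller sees, using the vp9_diamond_search_sad signature from this header; the wrapper name run_search is hypothetical.

  #include "vp9_rtcd.h" /* the arm-neon config shown in this diff */

  /* With this update the macro expands to vp9_diamond_search_sad_neon;
   * before, it expanded to vp9_diamond_search_sad_c. */
  static int run_search(const struct macroblock *x,
                        const struct search_site_config *cfg,
                        struct mv *ref_mv, struct mv *best_mv,
                        int search_param, int sad_per_bit, int *num00,
                        const struct vp9_variance_vtable *fn_ptr,
                        const struct mv *center_mv) {
    return vp9_diamond_search_sad(x, cfg, ref_mv, best_mv, search_param,
                                  sad_per_bit, num00, fn_ptr, center_mv);
  }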
diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h
index 99abbb974..565105892 100644
--- a/config/arm-neon/vpx_dsp_rtcd.h
+++ b/config/arm-neon/vpx_dsp_rtcd.h
@@ -287,7 +287,8 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran
#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
-#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon
void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
@@ -297,409 +298,544 @@ void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+void vpx_highbd_10_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_neon
void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+void vpx_highbd_10_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_neon
unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+unsigned int vpx_highbd_10_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_neon
unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+unsigned int vpx_highbd_10_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_neon
unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+unsigned int vpx_highbd_10_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_neon
unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+unsigned int vpx_highbd_10_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+unsigned int vpx_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_neon
unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+unsigned int vpx_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_neon
unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+unsigned int vpx_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_neon
unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+unsigned int vpx_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_neon
unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+unsigned int vpx_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_neon
unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+unsigned int vpx_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_neon
unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+unsigned int vpx_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_neon
unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+unsigned int vpx_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_neon
unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+unsigned int vpx_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_neon
unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+unsigned int vpx_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_neon
unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+unsigned int vpx_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_neon
unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+unsigned int vpx_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_neon
unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+unsigned int vpx_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_neon
void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+void vpx_highbd_12_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_neon
void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+void vpx_highbd_12_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_neon
unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+unsigned int vpx_highbd_12_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_neon
unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+unsigned int vpx_highbd_12_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_neon
unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+unsigned int vpx_highbd_12_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_neon
unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+unsigned int vpx_highbd_12_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+unsigned int vpx_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_neon
unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+unsigned int vpx_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_neon
unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+unsigned int vpx_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_neon
unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+unsigned int vpx_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_neon
unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+unsigned int vpx_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_neon
unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+unsigned int vpx_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_neon
unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+unsigned int vpx_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_neon
unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+unsigned int vpx_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_neon
unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+unsigned int vpx_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_neon
unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+unsigned int vpx_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_neon
unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+unsigned int vpx_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_neon
unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+unsigned int vpx_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_neon
unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+unsigned int vpx_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_neon
void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+void vpx_highbd_8_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_neon
void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+void vpx_highbd_8_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_neon
unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+unsigned int vpx_highbd_8_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_neon
unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+unsigned int vpx_highbd_8_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_neon
unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+unsigned int vpx_highbd_8_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_neon
unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+unsigned int vpx_highbd_8_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+unsigned int vpx_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_neon
unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+unsigned int vpx_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_neon
unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+unsigned int vpx_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_neon
unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+unsigned int vpx_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_neon
unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+unsigned int vpx_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_neon
unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+unsigned int vpx_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_neon
unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+unsigned int vpx_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_neon
unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+unsigned int vpx_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_neon
unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+unsigned int vpx_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_neon
unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+unsigned int vpx_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_neon
unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+unsigned int vpx_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_neon
unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+unsigned int vpx_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_neon
unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon
unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p);
#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
@@ -708,7 +844,8 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p);
#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
-#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_neon
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
@@ -887,25 +1024,32 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const
#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon
void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_neon
void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_neon
void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_neon
void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_neon
void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_neon
void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_neon
void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_neon
void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
@@ -1046,133 +1190,175 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp
#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon
void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon
unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+unsigned int vpx_highbd_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_neon
unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon
void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon
unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+unsigned int vpx_highbd_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_neon
unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon
void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_neon
unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+unsigned int vpx_highbd_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_neon
unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon
void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon
unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+unsigned int vpx_highbd_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_neon
unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon
void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon
unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+unsigned int vpx_highbd_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_neon
unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon
void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon
unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+unsigned int vpx_highbd_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_neon
unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon
void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon
unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+unsigned int vpx_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_neon
unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon
void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon
unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+unsigned int vpx_highbd_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_neon
unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon
void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon
unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+unsigned int vpx_highbd_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_neon
unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon
void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon
unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+unsigned int vpx_highbd_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_neon
unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon
void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon
unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+unsigned int vpx_highbd_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_neon
unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon
void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon
unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+unsigned int vpx_highbd_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_neon
unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon
void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon
unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+unsigned int vpx_highbd_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_neon
unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon
void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon
int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
#define vpx_highbd_satd vpx_highbd_satd_c
void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd);
-#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_neon
void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd);
void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd);
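
The changes above follow one mechanical pattern: each generic vpx_* symbol that previously fell back to its portable _c implementation now declares a _neon prototype and re-points the #define at it. Because the arm-neon and arm64 configs assume Neon is always present, dispatch is resolved entirely by the preprocessor, with no runtime CPU check. A minimal, self-contained sketch of that pattern follows; it is illustrative only, not libvpx source, and the _neon body is a stand-in rather than real SIMD.

/* Static-dispatch pattern used by the generated headers above: declare
 * every implementation of a generic symbol, then #define the symbol to
 * the best one known at configure time. */
#include <stdint.h>
#include <stdio.h>

static uint32_t variance8x8_c(const uint8_t *a, const uint8_t *b) {
  int sum = 0;
  uint32_t sse = 0;
  for (int i = 0; i < 64; ++i) {
    const int d = a[i] - b[i];
    sum += d;
    sse += (uint32_t)(d * d);
  }
  /* variance = SSE minus the mean-corrected term, as in the real kernels */
  return sse - (uint32_t)(((int64_t)sum * sum) / 64);
}

static uint32_t variance8x8_neon(const uint8_t *a, const uint8_t *b) {
  return variance8x8_c(a, b); /* stand-in: same result, SIMD in the library */
}

/* What a line like "#define vpx_highbd_12_variance8x8
 * vpx_highbd_12_variance8x8_neon" accomplishes for this config: */
#define variance8x8 variance8x8_neon

int main(void) {
  uint8_t a[64] = { 0 }, b[64] = { 0 };
  b[0] = 8;
  printf("variance: %u\n", variance8x8(a, b)); /* prints 63 */
  return 0;
}

On configs where SIMD support varies by CPU (the x86 headers in this same change), the generated file instead declares the symbol as a function pointer via RTCD_EXTERN and assigns it in setup_rtcd_internal() from the detected CPU capabilities.
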
diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h
index a90ab60d9..a58bfac01 100644
--- a/config/arm-neon/vpx_version.h
+++ b/config/arm-neon/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0"
+#define VERSION_STRING " v1.13.0"
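
The version bump also changes the packed form used for numeric version comparisons. A throwaway check of the encoding, not part of the library:

/* (major << 16) | (minor << 8) | patch, per the header above */
#include <stdio.h>

int main(void) {
  const int packed = (1 << 16) | (13 << 8) | 0;
  printf("VERSION_PACKED for v1.13.0: 0x%06X (%d)\n", packed, packed);
  /* prints 0x010D00 (68864) */
  return 0;
}
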
diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h
index 01065e667..b2b2fc2dc 100644
--- a/config/arm64/vp9_rtcd.h
+++ b/config/arm64/vp9_rtcd.h
@@ -38,7 +38,8 @@ int64_t vp9_block_error_fp_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
#define vp9_block_error_fp vp9_block_error_fp_c
int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
-#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+int vp9_diamond_search_sad_neon(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_neon
void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -62,7 +63,8 @@ void vp9_highbd_fht16x16_c(const int16_t *input, tran_low_t *output, int stride,
#define vp9_highbd_fht16x16 vp9_highbd_fht16x16_c
void vp9_highbd_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_c
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_highbd_fht4x4 vp9_highbd_fht4x4_neon
void vp9_highbd_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
#define vp9_highbd_fht8x8 vp9_highbd_fht8x8_c
@@ -83,10 +85,12 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int
#define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon
void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_neon
void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
+void vp9_highbd_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_neon
void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
#define vp9_highbd_temporal_filter_apply vp9_highbd_temporal_filter_apply_c
diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h
index 99abbb974..565105892 100644
--- a/config/arm64/vpx_dsp_rtcd.h
+++ b/config/arm64/vpx_dsp_rtcd.h
@@ -287,7 +287,8 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran
#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
void vpx_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
-#define vpx_hadamard_32x32 vpx_hadamard_32x32_c
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
+#define vpx_hadamard_32x32 vpx_hadamard_32x32_neon
void vpx_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
void vpx_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff);
@@ -297,409 +298,544 @@ void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, const uint8_t *above
#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
void vpx_highbd_10_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_c
+void vpx_highbd_10_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get16x16var vpx_highbd_10_get16x16var_neon
void vpx_highbd_10_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_c
+void vpx_highbd_10_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_10_get8x8var vpx_highbd_10_get8x8var_neon
unsigned int vpx_highbd_10_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_c
+unsigned int vpx_highbd_10_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x16 vpx_highbd_10_mse16x16_neon
unsigned int vpx_highbd_10_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_c
+unsigned int vpx_highbd_10_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse16x8 vpx_highbd_10_mse16x8_neon
unsigned int vpx_highbd_10_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_c
+unsigned int vpx_highbd_10_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x16 vpx_highbd_10_mse8x16_neon
unsigned int vpx_highbd_10_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_c
+unsigned int vpx_highbd_10_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_mse8x8 vpx_highbd_10_mse8x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x16 vpx_highbd_10_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x32 vpx_highbd_10_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance16x8 vpx_highbd_10_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x16 vpx_highbd_10_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x32 vpx_highbd_10_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance32x64 vpx_highbd_10_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x4 vpx_highbd_10_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance4x8 vpx_highbd_10_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x32 vpx_highbd_10_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance64x64 vpx_highbd_10_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x16 vpx_highbd_10_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x4 vpx_highbd_10_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_10_sub_pixel_avg_variance8x8 vpx_highbd_10_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x16 vpx_highbd_10_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x32 vpx_highbd_10_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance16x8 vpx_highbd_10_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x16 vpx_highbd_10_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x32 vpx_highbd_10_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_10_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance32x64 vpx_highbd_10_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_10_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x4 vpx_highbd_10_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_10_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance4x8 vpx_highbd_10_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_10_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_10_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x32 vpx_highbd_10_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_10_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_10_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance64x64 vpx_highbd_10_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x16 vpx_highbd_10_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x4 vpx_highbd_10_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_10_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_10_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_10_sub_pixel_variance8x8 vpx_highbd_10_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_10_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_c
+unsigned int vpx_highbd_10_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x16 vpx_highbd_10_variance16x16_neon
unsigned int vpx_highbd_10_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_c
+unsigned int vpx_highbd_10_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x32 vpx_highbd_10_variance16x32_neon
unsigned int vpx_highbd_10_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_c
+unsigned int vpx_highbd_10_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance16x8 vpx_highbd_10_variance16x8_neon
unsigned int vpx_highbd_10_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_c
+unsigned int vpx_highbd_10_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x16 vpx_highbd_10_variance32x16_neon
unsigned int vpx_highbd_10_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_c
+unsigned int vpx_highbd_10_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x32 vpx_highbd_10_variance32x32_neon
unsigned int vpx_highbd_10_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_c
+unsigned int vpx_highbd_10_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance32x64 vpx_highbd_10_variance32x64_neon
unsigned int vpx_highbd_10_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_c
+unsigned int vpx_highbd_10_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x4 vpx_highbd_10_variance4x4_neon
unsigned int vpx_highbd_10_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_c
+unsigned int vpx_highbd_10_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance4x8 vpx_highbd_10_variance4x8_neon
unsigned int vpx_highbd_10_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_c
+unsigned int vpx_highbd_10_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x32 vpx_highbd_10_variance64x32_neon
unsigned int vpx_highbd_10_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_c
+unsigned int vpx_highbd_10_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance64x64 vpx_highbd_10_variance64x64_neon
unsigned int vpx_highbd_10_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_c
+unsigned int vpx_highbd_10_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x16 vpx_highbd_10_variance8x16_neon
unsigned int vpx_highbd_10_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_c
+unsigned int vpx_highbd_10_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x4 vpx_highbd_10_variance8x4_neon
unsigned int vpx_highbd_10_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_c
+unsigned int vpx_highbd_10_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_10_variance8x8 vpx_highbd_10_variance8x8_neon
void vpx_highbd_12_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_c
+void vpx_highbd_12_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get16x16var vpx_highbd_12_get16x16var_neon
void vpx_highbd_12_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_c
+void vpx_highbd_12_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_12_get8x8var vpx_highbd_12_get8x8var_neon
unsigned int vpx_highbd_12_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_c
+unsigned int vpx_highbd_12_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x16 vpx_highbd_12_mse16x16_neon
unsigned int vpx_highbd_12_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_c
+unsigned int vpx_highbd_12_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse16x8 vpx_highbd_12_mse16x8_neon
unsigned int vpx_highbd_12_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_c
+unsigned int vpx_highbd_12_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x16 vpx_highbd_12_mse8x16_neon
unsigned int vpx_highbd_12_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_c
+unsigned int vpx_highbd_12_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_mse8x8 vpx_highbd_12_mse8x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x16 vpx_highbd_12_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x32 vpx_highbd_12_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance16x8 vpx_highbd_12_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x16 vpx_highbd_12_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x32 vpx_highbd_12_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance32x64 vpx_highbd_12_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x4 vpx_highbd_12_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance4x8 vpx_highbd_12_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x32 vpx_highbd_12_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance64x64 vpx_highbd_12_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x16 vpx_highbd_12_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x4 vpx_highbd_12_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_12_sub_pixel_avg_variance8x8 vpx_highbd_12_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x16 vpx_highbd_12_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x32 vpx_highbd_12_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance16x8 vpx_highbd_12_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x16 vpx_highbd_12_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x32 vpx_highbd_12_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_12_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance32x64 vpx_highbd_12_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_12_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x4 vpx_highbd_12_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_12_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance4x8 vpx_highbd_12_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_12_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_12_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x32 vpx_highbd_12_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_12_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_12_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance64x64 vpx_highbd_12_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x16 vpx_highbd_12_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x4 vpx_highbd_12_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_12_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_12_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_12_sub_pixel_variance8x8 vpx_highbd_12_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_12_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_c
+unsigned int vpx_highbd_12_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x16 vpx_highbd_12_variance16x16_neon
unsigned int vpx_highbd_12_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_c
+unsigned int vpx_highbd_12_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x32 vpx_highbd_12_variance16x32_neon
unsigned int vpx_highbd_12_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_c
+unsigned int vpx_highbd_12_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance16x8 vpx_highbd_12_variance16x8_neon
unsigned int vpx_highbd_12_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_c
+unsigned int vpx_highbd_12_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x16 vpx_highbd_12_variance32x16_neon
unsigned int vpx_highbd_12_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_c
+unsigned int vpx_highbd_12_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x32 vpx_highbd_12_variance32x32_neon
unsigned int vpx_highbd_12_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_c
+unsigned int vpx_highbd_12_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance32x64 vpx_highbd_12_variance32x64_neon
unsigned int vpx_highbd_12_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_c
+unsigned int vpx_highbd_12_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x4 vpx_highbd_12_variance4x4_neon
unsigned int vpx_highbd_12_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_c
+unsigned int vpx_highbd_12_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance4x8 vpx_highbd_12_variance4x8_neon
unsigned int vpx_highbd_12_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_c
+unsigned int vpx_highbd_12_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x32 vpx_highbd_12_variance64x32_neon
unsigned int vpx_highbd_12_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_c
+unsigned int vpx_highbd_12_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance64x64 vpx_highbd_12_variance64x64_neon
unsigned int vpx_highbd_12_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_c
+unsigned int vpx_highbd_12_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x16 vpx_highbd_12_variance8x16_neon
unsigned int vpx_highbd_12_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_c
+unsigned int vpx_highbd_12_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x4 vpx_highbd_12_variance8x4_neon
unsigned int vpx_highbd_12_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_c
+unsigned int vpx_highbd_12_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_12_variance8x8 vpx_highbd_12_variance8x8_neon
void vpx_highbd_8_get16x16var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_c
+void vpx_highbd_8_get16x16var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get16x16var vpx_highbd_8_get16x16var_neon
void vpx_highbd_8_get8x8var_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
-#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_c
+void vpx_highbd_8_get8x8var_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_highbd_8_get8x8var vpx_highbd_8_get8x8var_neon
unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_c
+unsigned int vpx_highbd_8_mse16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_neon
unsigned int vpx_highbd_8_mse16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_c
+unsigned int vpx_highbd_8_mse16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse16x8 vpx_highbd_8_mse16x8_neon
unsigned int vpx_highbd_8_mse8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_c
+unsigned int vpx_highbd_8_mse8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x16 vpx_highbd_8_mse8x16_neon
unsigned int vpx_highbd_8_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_c
+unsigned int vpx_highbd_8_mse8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_mse8x8 vpx_highbd_8_mse8x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x16 vpx_highbd_8_sub_pixel_avg_variance16x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x32 vpx_highbd_8_sub_pixel_avg_variance16x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance16x8 vpx_highbd_8_sub_pixel_avg_variance16x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x16 vpx_highbd_8_sub_pixel_avg_variance32x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x32 vpx_highbd_8_sub_pixel_avg_variance32x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance32x64 vpx_highbd_8_sub_pixel_avg_variance32x64_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x4 vpx_highbd_8_sub_pixel_avg_variance4x4_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance4x8 vpx_highbd_8_sub_pixel_avg_variance4x8_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x32 vpx_highbd_8_sub_pixel_avg_variance64x32_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance64x64 vpx_highbd_8_sub_pixel_avg_variance64x64_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x16 vpx_highbd_8_sub_pixel_avg_variance8x16_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x4 vpx_highbd_8_sub_pixel_avg_variance8x4_neon
uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
-#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_c
+uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred);
+#define vpx_highbd_8_sub_pixel_avg_variance8x8 vpx_highbd_8_sub_pixel_avg_variance8x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x16 vpx_highbd_8_sub_pixel_variance16x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x32 vpx_highbd_8_sub_pixel_variance16x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance16x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance16x8 vpx_highbd_8_sub_pixel_variance16x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x16 vpx_highbd_8_sub_pixel_variance32x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x32 vpx_highbd_8_sub_pixel_variance32x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_c
+uint32_t vpx_highbd_8_sub_pixel_variance32x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance32x64 vpx_highbd_8_sub_pixel_variance32x64_neon
uint32_t vpx_highbd_8_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_c
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x4 vpx_highbd_8_sub_pixel_variance4x4_neon
uint32_t vpx_highbd_8_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance4x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance4x8 vpx_highbd_8_sub_pixel_variance4x8_neon
uint32_t vpx_highbd_8_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_c
+uint32_t vpx_highbd_8_sub_pixel_variance64x32_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x32 vpx_highbd_8_sub_pixel_variance64x32_neon
uint32_t vpx_highbd_8_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_c
+uint32_t vpx_highbd_8_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance64x64 vpx_highbd_8_sub_pixel_variance64x64_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x16_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x16 vpx_highbd_8_sub_pixel_variance8x16_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x4_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x4 vpx_highbd_8_sub_pixel_variance8x4_neon
uint32_t vpx_highbd_8_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
-#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_c
+uint32_t vpx_highbd_8_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_highbd_8_sub_pixel_variance8x8 vpx_highbd_8_sub_pixel_variance8x8_neon
unsigned int vpx_highbd_8_variance16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_c
+unsigned int vpx_highbd_8_variance16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x16 vpx_highbd_8_variance16x16_neon
unsigned int vpx_highbd_8_variance16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_c
+unsigned int vpx_highbd_8_variance16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x32 vpx_highbd_8_variance16x32_neon
unsigned int vpx_highbd_8_variance16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_c
+unsigned int vpx_highbd_8_variance16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance16x8 vpx_highbd_8_variance16x8_neon
unsigned int vpx_highbd_8_variance32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_c
+unsigned int vpx_highbd_8_variance32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x16 vpx_highbd_8_variance32x16_neon
unsigned int vpx_highbd_8_variance32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_c
+unsigned int vpx_highbd_8_variance32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x32 vpx_highbd_8_variance32x32_neon
unsigned int vpx_highbd_8_variance32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_c
+unsigned int vpx_highbd_8_variance32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance32x64 vpx_highbd_8_variance32x64_neon
unsigned int vpx_highbd_8_variance4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_c
+unsigned int vpx_highbd_8_variance4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x4 vpx_highbd_8_variance4x4_neon
unsigned int vpx_highbd_8_variance4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_c
+unsigned int vpx_highbd_8_variance4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance4x8 vpx_highbd_8_variance4x8_neon
unsigned int vpx_highbd_8_variance64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_c
+unsigned int vpx_highbd_8_variance64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x32 vpx_highbd_8_variance64x32_neon
unsigned int vpx_highbd_8_variance64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_c
+unsigned int vpx_highbd_8_variance64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance64x64 vpx_highbd_8_variance64x64_neon
unsigned int vpx_highbd_8_variance8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_c
+unsigned int vpx_highbd_8_variance8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x16 vpx_highbd_8_variance8x16_neon
unsigned int vpx_highbd_8_variance8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_c
+unsigned int vpx_highbd_8_variance8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x4 vpx_highbd_8_variance8x4_neon
unsigned int vpx_highbd_8_variance8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
-#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_c
+unsigned int vpx_highbd_8_variance8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_highbd_8_variance8x8 vpx_highbd_8_variance8x8_neon
unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p);
#define vpx_highbd_avg_4x4 vpx_highbd_avg_4x4_c
@@ -708,7 +844,8 @@ unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p);
#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_c
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
-#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_neon
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
@@ -887,25 +1024,32 @@ void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, const
#define vpx_highbd_dc_top_predictor_8x8 vpx_highbd_dc_top_predictor_8x8_neon
void vpx_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_c
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16 vpx_highbd_fdct16x16_neon
void vpx_highbd_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_c
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct16x16_1 vpx_highbd_fdct16x16_1_neon
void vpx_highbd_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_c
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32 vpx_highbd_fdct32x32_neon
void vpx_highbd_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_c
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_1 vpx_highbd_fdct32x32_1_neon
void vpx_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_c
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct32x32_rd vpx_highbd_fdct32x32_rd_neon
void vpx_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_c
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct4x4 vpx_highbd_fdct4x4_neon
void vpx_highbd_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
-#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_c
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_highbd_fdct8x8 vpx_highbd_fdct8x8_neon
void vpx_highbd_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
@@ -1046,133 +1190,175 @@ void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp
#define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b vpx_highbd_quantize_b_neon
void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
+void vpx_highbd_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_neon
unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_c
+unsigned int vpx_highbd_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x16 vpx_highbd_sad16x16_neon
unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
+unsigned int vpx_highbd_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_neon
void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
+void vpx_highbd_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_neon
unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_c
+unsigned int vpx_highbd_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x32 vpx_highbd_sad16x32_neon
unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
+unsigned int vpx_highbd_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_neon
void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
+void vpx_highbd_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_neon
unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_c
+unsigned int vpx_highbd_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad16x8 vpx_highbd_sad16x8_neon
unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
+unsigned int vpx_highbd_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_neon
void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
+void vpx_highbd_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_neon
unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_c
+unsigned int vpx_highbd_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x16 vpx_highbd_sad32x16_neon
unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
+unsigned int vpx_highbd_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_neon
void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
+void vpx_highbd_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_neon
unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_c
+unsigned int vpx_highbd_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x32 vpx_highbd_sad32x32_neon
unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
+unsigned int vpx_highbd_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_neon
void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
+void vpx_highbd_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_neon
unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_c
+unsigned int vpx_highbd_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad32x64 vpx_highbd_sad32x64_neon
unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
+unsigned int vpx_highbd_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_neon
void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
+void vpx_highbd_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_neon
unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_c
+unsigned int vpx_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x4 vpx_highbd_sad4x4_neon
unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
+unsigned int vpx_highbd_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_neon
void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
+void vpx_highbd_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_neon
unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_c
+unsigned int vpx_highbd_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad4x8 vpx_highbd_sad4x8_neon
unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
+unsigned int vpx_highbd_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_neon
void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
+void vpx_highbd_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_neon
unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_c
+unsigned int vpx_highbd_sad64x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x32 vpx_highbd_sad64x32_neon
unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
+unsigned int vpx_highbd_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_neon
void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
+void vpx_highbd_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_neon
unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_c
+unsigned int vpx_highbd_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad64x64 vpx_highbd_sad64x64_neon
unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
+unsigned int vpx_highbd_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_neon
void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
+void vpx_highbd_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_neon
unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_c
+unsigned int vpx_highbd_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x16 vpx_highbd_sad8x16_neon
unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
+unsigned int vpx_highbd_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_neon
void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
+void vpx_highbd_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_neon
unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_c
+unsigned int vpx_highbd_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x4 vpx_highbd_sad8x4_neon
unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
+unsigned int vpx_highbd_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_neon
void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
+void vpx_highbd_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_neon
unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
-#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_c
+unsigned int vpx_highbd_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
+#define vpx_highbd_sad8x8 vpx_highbd_sad8x8_neon
unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
-#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
+unsigned int vpx_highbd_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
+#define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_neon
void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
-#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
+void vpx_highbd_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+#define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_neon
int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
#define vpx_highbd_satd vpx_highbd_satd_c
void vpx_highbd_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd);
-#define vpx_highbd_subtract_block vpx_highbd_subtract_block_c
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd);
+#define vpx_highbd_subtract_block vpx_highbd_subtract_block_neon
void vpx_highbd_tm_predictor_16x16_c(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd);
void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd);
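The pattern repeated throughout this generated header is libvpx's RTCD (run-time CPU detection) scheme specialized to a fixed configuration: every kernel has a portable _c prototype, the Neon prototype is declared alongside it, and the dispatch macro is retargeted from the C name to the Neon name. Because these Android config headers are pregenerated for a known feature set, the selection is a compile-time #define rather than a runtime function pointer. A minimal sketch of the pattern, using hypothetical names (my_sad_c / my_sad_neon):

/* Sketch of the compile-time RTCD pattern above; the names are
 * hypothetical. Each kernel has a portable C version and an optional
 * SIMD version with the same signature, and the generated header binds
 * the public name to exactly one of them. */
unsigned int my_sad_c(const unsigned char *src, const unsigned char *ref,
                      int n);
unsigned int my_sad_neon(const unsigned char *src, const unsigned char *ref,
                         int n);

#if defined(__ARM_NEON)
#define my_sad my_sad_neon /* fixed-config build: resolved at compile time */
#else
#define my_sad my_sad_c /* portable fallback */
#endif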
diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h
index a90ab60d9..a58bfac01 100644
--- a/config/arm64/vpx_version.h
+++ b/config/arm64/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0"
+#define VERSION_STRING " v1.13.0"
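As a sanity check on the generated values: VERSION_PACKED stores one version component per byte, so the bump from v1.12.0 to v1.13.0 changes the packed value from 0x010C00 to 0x010D00. A small verification sketch:

#include <assert.h>

/* VERSION_PACKED places major/minor/patch in consecutive bytes. */
int main(void) {
  assert(((1 << 16) | (13 << 8) | 0) == 0x010D00); /* v1.13.0 */
  assert(((1 << 16) | (12 << 8) | 0) == 0x010C00); /* v1.12.0 */
  return 0;
}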
diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h
index a90ab60d9..a58bfac01 100644
--- a/config/generic/vpx_version.h
+++ b/config/generic/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0"
+#define VERSION_STRING " v1.13.0"
diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h
index cff5e7f63..580d55a28 100644
--- a/config/x86/vp9_rtcd.h
+++ b/config/x86/vp9_rtcd.h
@@ -106,10 +106,12 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp vp9_quantize_fp_sse2
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_ssse3
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3
void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
void vp9_scale_and_extend_frame_ssse3(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h
index 8b94dd89f..91242deee 100644
--- a/config/x86/vpx_dsp_rtcd.h
+++ b/config/x86/vpx_dsp_rtcd.h
@@ -833,7 +833,8 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p);
#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_sse2
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
-#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_sse2
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
#define vpx_highbd_convolve8 vpx_highbd_convolve8_c
diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h
index a90ab60d9..a58bfac01 100644
--- a/config/x86/vpx_version.h
+++ b/config/x86/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0"
+#define VERSION_STRING " v1.13.0"
diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h
index 284453f06..22401f1c0 100644
--- a/config/x86_64/vpx_dsp_rtcd.h
+++ b/config/x86_64/vpx_dsp_rtcd.h
@@ -834,7 +834,8 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p);
#define vpx_highbd_avg_8x8 vpx_highbd_avg_8x8_sse2
void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
-#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_c
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride);
+#define vpx_highbd_comp_avg_pred vpx_highbd_comp_avg_pred_sse2
void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
void vpx_highbd_convolve8_sse2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd);
diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h
index a90ab60d9..a58bfac01 100644
--- a/config/x86_64/vpx_version.h
+++ b/config/x86_64/vpx_version.h
@@ -1,8 +1,8 @@
// This file is generated. Do not edit.
#define VERSION_MAJOR 1
-#define VERSION_MINOR 12
+#define VERSION_MINOR 13
#define VERSION_PATCH 0
#define VERSION_EXTRA ""
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.12.0"
-#define VERSION_STRING " v1.12.0"
+#define VERSION_STRING_NOSP "v1.13.0"
+#define VERSION_STRING " v1.13.0"
diff --git a/libvpx/.clang-format b/libvpx/.clang-format
index 866b7e211..a8bc4967c 100644
--- a/libvpx/.clang-format
+++ b/libvpx/.clang-format
@@ -1,149 +1,9 @@
---
Language: Cpp
-# BasedOnStyle: Google
-# Generated with clang-format 7.0.1
-AccessModifierOffset: -1
-AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignEscapedNewlines: Left
-AlignOperands: true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
+BasedOnStyle: Google
AllowShortCaseLabelsOnASingleLine: true
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: true
-AllowShortLoopsOnASingleLine: true
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: true
-AlwaysBreakTemplateDeclarations: true
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
- AfterClass: false
- AfterControlStatement: false
- AfterEnum: false
- AfterFunction: false
- AfterNamespace: false
- AfterObjCDeclaration: false
- AfterStruct: false
- AfterUnion: false
- AfterExternBlock: false
- BeforeCatch: false
- BeforeElse: false
- IndentBraces: false
- SplitEmptyFunction: true
- SplitEmptyRecord: true
- SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeBraces: Attach
-BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit: 80
-CommentPragmas: '^ IWYU pragma:'
-CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
Cpp11BracedListStyle: false
DerivePointerAlignment: false
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: true
-ForEachMacros:
- - foreach
- - Q_FOREACH
- - BOOST_FOREACH
-IncludeBlocks: Preserve
-IncludeCategories:
- - Regex: '^<ext/.*\.h>'
- Priority: 2
- - Regex: '^<.*\.h>'
- Priority: 1
- - Regex: '^<.*'
- Priority: 2
- - Regex: '.*'
- Priority: 3
-IncludeIsMainRegex: '([-_](test|unittest))?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth: 2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: false
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Never
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: false
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 1
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakTemplateDeclaration: 10
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Right
-RawStringFormats:
- - Language: Cpp
- Delimiters:
- - cc
- - CC
- - cpp
- - Cpp
- - CPP
- - 'c++'
- - 'C++'
- CanonicalDelimiter: ''
- BasedOnStyle: google
- - Language: TextProto
- Delimiters:
- - pb
- - PB
- - proto
- - PROTO
- EnclosingFunctions:
- - EqualsProto
- - EquivToProto
- - PARSE_PARTIAL_TEXT_PROTO
- - PARSE_TEST_PROTO
- - PARSE_TEXT_PROTO
- - ParseTextOrDie
- - ParseTextProtoOrDie
- CanonicalDelimiter: ''
- BasedOnStyle: google
-ReflowComments: true
SortIncludes: false
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 2
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Auto
-TabWidth: 8
-UseTab: Never
-...
-
diff --git a/libvpx/.mailmap b/libvpx/.mailmap
index 376ca83ae..bb0ddd95b 100644
--- a/libvpx/.mailmap
+++ b/libvpx/.mailmap
@@ -21,10 +21,11 @@ Jacky Chen <jackychen@google.com>
Jim Bankoski <jimbankoski@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johann.koenig@gmail.com>
Johann Koenig <johannkoenig@google.com> <johannkoenig@chromium.org>
+Johann <johann@duck.com> <johann.koenig@gmail.com>
John Koleszar <jkoleszar@google.com>
Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Konstantinos Margaritis <konma@vectorcamp.gr> <konstantinos@vectorcamp.gr>
Marco Paniconi <marpan@google.com>
Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Martin Storsjö <martin@martin.st>
diff --git a/libvpx/AUTHORS b/libvpx/AUTHORS
index fffda6336..2db4a113e 100644
--- a/libvpx/AUTHORS
+++ b/libvpx/AUTHORS
@@ -21,8 +21,10 @@ Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
Andrew Lewis <andrewlewis@google.com>
Andrew Russell <anrussell@google.com>
+Andrew Salkeld <andrew.salkeld@arm.com>
Angie Chen <yunqi@google.com>
Angie Chiang <angiebird@google.com>
+Anton Venema <anton.venema@liveswitch.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
Birk Magnussen <birk.magnussen@googlemail.com>
@@ -174,7 +176,9 @@ Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rsbultje@gmail.com>
Rui Ueyama <ruiu@google.com>
Sai Deng <sdeng@google.com>
+Salome Thirot <salome.thirot@arm.com>
Sami Pietilä <samipietila@google.com>
+Sam James <sam@gentoo.org>
Sarah Parker <sarahparker@google.com>
Sasi Inguva <isasi@google.com>
Scott Graham <scottmg@chromium.org>
diff --git a/libvpx/CHANGELOG b/libvpx/CHANGELOG
index cd4e8ba43..3fb2d19bb 100644
--- a/libvpx/CHANGELOG
+++ b/libvpx/CHANGELOG
@@ -1,3 +1,39 @@
+2023-01-31 v1.13.0 "Ugly Duckling"
+ This release includes more Neon and AVX2 optimizations, adds a new codec
+ control to set per frame QP, upgrades GoogleTest to v1.12.1, and includes
+ numerous bug fixes.
+
+ - Upgrading:
+ This release is ABI incompatible with the previous release.
+
+ New codec control VP9E_SET_QUANTIZER_ONE_PASS to set per frame QP.
+
+ GoogleTest is upgraded to v1.12.1.
+
+ .clang-format is upgraded to clang-format-11.
+
+ VPX_EXT_RATECTRL_ABI_VERSION was bumped due to incompatible changes to the
+ feature of using external rate control models for vp9.
+
+ - Enhancement:
+ Numerous improvements on Neon optimizations.
+ Numerous improvements on AVX2 optimizations.
+ Additional ARM targets added for Visual Studio.
+
+ - Bug fixes:
+ Fix to calculating internal stats when frame dropped.
+ Fix to segfault for external resize test in vp9.
+ Fix to build system with replacing egrep with grep -E.
+ Fix to a few bugs with external RTC rate control library.
+ Fix to make SVC work with VBR.
+ Fix to key frame setting in VP9 external RC.
+ Fix to -Wimplicit-int (Clang 16).
+ Fix to VP8 external RC for buffer levels.
+ Fix to VP8 external RC for dynamic update of layers.
+ Fix to VP9 auto level.
+ Fix to off-by-one error of max w/h in validate_config.
+ Fix to make SVC work for Profile 1.
+
2022-06-17 v1.12.0 "Torrent Duck"
This release adds optimizations for Loongarch, adds support for vp8 in the
real-time rate control library, upgrades GoogleTest to v1.11.0, updates
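The VP9E_SET_QUANTIZER_ONE_PASS control called out in the 1.13.0 notes above goes through the usual vpx_codec_control() path. A minimal sketch, assuming an already-initialized one-pass VP9 encoder context and an int-valued QP argument:

#include "vpx/vp8cx.h"
#include "vpx/vpx_encoder.h"

/* Sketch: request a specific QP for the next encoded frame. 'codec' is
 * assumed to be a vpx_codec_ctx_t initialized for one-pass VP9. */
static vpx_codec_err_t set_frame_qp(vpx_codec_ctx_t *codec, int qp) {
  return vpx_codec_control(codec, VP9E_SET_QUANTIZER_ONE_PASS, qp);
}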
@@ -36,6 +72,7 @@
levels, and includes several improvements to NEON and numerous bug fixes.
- Upgrading:
+ This release is ABI incompatible with the previous release.
New codec control is added to get quantization parameters and loop filter
levels.
@@ -61,6 +98,7 @@
well as numerous bug fixes.
- Upgrading:
+ This release is ABI incompatible with the previous release.
New codec control is added to disable loopfilter for VP9.
New encoder control is added to disable feature to increase Q on overshoot
@@ -91,6 +129,7 @@
well as incremental improvements.
- Upgrading:
+ This release is ABI compatible with the previous release.
NV12 support is added to this release.
A new interface is added for VP9 rate control. The new library libvp9rc.a
must be linked by applications.
@@ -114,12 +153,14 @@
This release collects incremental improvements to many aspects of the library.
- Upgrading:
+ This release is ABI compatible with the previous release.
ARCH_* defines have been removed in favor of VPX_ARCH_*.
2019-07-15 v1.8.1 "Orpington Duck"
This release collects incremental improvements to many aspects of the library.
- Upgrading:
+ This release is ABI incompatible with the previous release.
VP8E_SET_CPUUSED now accepts values up to 9 for vp9.
VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT had a spelling fix (was VP8E).
The --sdk-path option has been removed. If you were using it to build for
@@ -138,7 +179,8 @@
This release focused on encoding performance for realtime and VOD use cases.
- Upgrading:
- This adds and improves several vp9 controls. Most are related to SVC:
+ This release is ABI incompatible with the previous release. This adds and
+ improves several vp9 controls. Most are related to SVC:
VP9E_SET_SVC_FRAME_DROP_LAYER:
- Frame dropping in SVC.
VP9E_SET_SVC_INTER_LAYER_PRED:
diff --git a/libvpx/build/make/Android.mk b/libvpx/build/make/Android.mk
index b8032e67a..ba24f541b 100644
--- a/libvpx/build/make/Android.mk
+++ b/libvpx/build/make/Android.mk
@@ -8,6 +8,8 @@
## be found in the AUTHORS file in the root of the source tree.
##
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
#
# This file is to be used for compiling libvpx for Android using the NDK.
# In an Android project place a libvpx checkout in the jni directory.
@@ -212,3 +214,4 @@ endif
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
$(call import-module,android/cpufeatures)
endif
+endif # NDK_ROOT
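The effect of the new guard is that the makefile's body is evaluated only when the NDK build system has defined NDK_ROOT; any other build that sweeps up this file sees an effectively empty makefile. The pattern in isolation:

# Sketch of the guard added above: the body is skipped entirely unless
# NDK_ROOT is defined (the NDK's build system defines it).
ifdef NDK_ROOT
LOCAL_PATH := $(call my-dir)  # hypothetical NDK-only content
endif # NDK_ROOT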
diff --git a/libvpx/build/make/Makefile b/libvpx/build/make/Makefile
index b7a873cc8..5c38c18e5 100644
--- a/libvpx/build/make/Makefile
+++ b/libvpx/build/make/Makefile
@@ -21,9 +21,9 @@ all: .DEFAULT
clean:: .DEFAULT
exampletest: .DEFAULT
install:: .DEFAULT
-test:: .DEFAULT
-test-no-data-check:: .DEFAULT
-testdata:: .DEFAULT
+test: .DEFAULT
+test-no-data-check: .DEFAULT
+testdata: .DEFAULT
utiltest: .DEFAULT
exampletest-no-data-check utiltest-no-data-check: .DEFAULT
test_%: .DEFAULT ;
@@ -111,13 +111,13 @@ exampletest:
.PHONY: install
install::
.PHONY: test
-test::
+test:
.PHONY: testdata
-testdata::
+testdata:
.PHONY: utiltest
utiltest:
.PHONY: test-no-data-check exampletest-no-data-check utiltest-no-data-check
-test-no-data-check::
+test-no-data-check:
exampletest-no-data-check utiltest-no-data-check:
# Force to realign stack always on OS/2
@@ -465,6 +465,6 @@ INSTALL_TARGETS += .install-docs .install-srcs .install-libs .install-bins
all: $(BUILD_TARGETS)
install:: $(INSTALL_TARGETS)
dist: $(INSTALL_TARGETS)
-test::
+test:
.SUFFIXES: # Delete default suffix rules
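The :: to : changes above concern GNU Make's two rule flavors: double-colon rules let one target carry several independent recipes, but a given target must use one flavor consistently, so the test targets are normalized to ordinary single-colon rules. A minimal illustration:

# Double-colon rules: both recipes run when 'demo' is remade.
demo:: ; @echo first
demo:: ; @echo second

# Mixing flavors is an error: adding an ordinary rule such as
#   demo: ; @echo third
# makes GNU Make stop with (roughly) "target file 'demo' has both
# : and :: entries".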
diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh
index 581042e38..4bf090f00 100755
--- a/libvpx/build/make/configure.sh
+++ b/libvpx/build/make/configure.sh
@@ -791,7 +791,7 @@ process_common_toolchain() {
tgt_isa=x86_64
tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'`
;;
- *darwin2[0-1]*)
+ *darwin2[0-2]*)
tgt_isa=`uname -m`
tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'`
;;
@@ -940,7 +940,7 @@ process_common_toolchain() {
add_cflags "-mmacosx-version-min=10.15"
add_ldflags "-mmacosx-version-min=10.15"
;;
- *-darwin2[0-1]-*)
+ *-darwin2[0-2]-*)
add_cflags "-arch ${toolchain%%-*}"
add_ldflags "-arch ${toolchain%%-*}"
;;
@@ -1511,7 +1511,7 @@ EOF
# Try to find which inline keywords are supported
check_cc <<EOF && INLINE="inline"
-static inline function() {}
+static inline int function(void) {}
EOF
# Almost every platform uses pthreads.
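The probe fix above matters because the old form relied on pre-C99 implicit int, which Clang 16 rejects by default; the probe would therefore always fail and the inline keyword would be misdetected. Side by side:

/* Rejected by Clang 16 (implicit 'int' return type is an error by
 * default; the diagnostic is roughly "type specifier missing,
 * defaults to 'int'"): */
/* static inline function() {} */

/* Accepted in all supported C dialects: */
static inline int function(void) {}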
diff --git a/libvpx/build/make/gen_asm_deps.sh b/libvpx/build/make/gen_asm_deps.sh
index 6a7bff9eb..3bd4d125f 100755
--- a/libvpx/build/make/gen_asm_deps.sh
+++ b/libvpx/build/make/gen_asm_deps.sh
@@ -42,7 +42,7 @@ done
[ -n "$srcfile" ] || show_help
sfx=${sfx:-asm}
-includes=$(LC_ALL=C egrep -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
+includes=$(LC_ALL=C grep -E -i "include +\"?[a-z0-9_/]+\.${sfx}" $srcfile |
perl -p -e "s;.*?([a-z0-9_/]+.${sfx}).*;\1;")
#" restore editor state
for inc in ${includes}; do
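egrep is deprecated in recent GNU grep releases (it now emits a warning and may eventually be removed), and grep -E is the POSIX-specified equivalent, so the substitution above is behavior-preserving. For example:

# Matches the same include lines either way; only the spelling changes.
printf '%%include "vpx_dsp/arm/foo.asm"\n' |
    LC_ALL=C grep -E -i 'include +"?[a-z0-9_/]+\.asm'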
diff --git a/libvpx/build/make/rtcd.pl b/libvpx/build/make/rtcd.pl
index 9c9726842..f4edeaad5 100755
--- a/libvpx/build/make/rtcd.pl
+++ b/libvpx/build/make/rtcd.pl
@@ -488,7 +488,8 @@ if ($opts{arch} eq 'x86') {
arm;
} elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) {
@ALL_ARCHS = filter(qw/neon/);
- &require("neon");
+ @REQUIRES = filter(qw/neon/);
+ &require(@REQUIRES);
arm;
} elsif ($opts{arch} =~ /^ppc/ ) {
@ALL_ARCHS = filter(qw/vsx/);
diff --git a/libvpx/configure b/libvpx/configure
index beea65032..ae289f77b 100755
--- a/libvpx/configure
+++ b/libvpx/configure
@@ -101,9 +101,12 @@ all_platforms="${all_platforms} arm64-android-gcc"
all_platforms="${all_platforms} arm64-darwin-gcc"
all_platforms="${all_platforms} arm64-darwin20-gcc"
all_platforms="${all_platforms} arm64-darwin21-gcc"
+all_platforms="${all_platforms} arm64-darwin22-gcc"
all_platforms="${all_platforms} arm64-linux-gcc"
all_platforms="${all_platforms} arm64-win64-gcc"
all_platforms="${all_platforms} arm64-win64-vs15"
+all_platforms="${all_platforms} arm64-win64-vs16"
+all_platforms="${all_platforms} arm64-win64-vs17"
all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8
all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8
@@ -112,6 +115,8 @@ all_platforms="${all_platforms} armv7-none-rvct" #neon Cortex-A8
all_platforms="${all_platforms} armv7-win32-gcc"
all_platforms="${all_platforms} armv7-win32-vs14"
all_platforms="${all_platforms} armv7-win32-vs15"
+all_platforms="${all_platforms} armv7-win32-vs16"
+all_platforms="${all_platforms} armv7-win32-vs17"
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} armv8-linux-gcc"
all_platforms="${all_platforms} loongarch32-linux-gcc"
@@ -157,6 +162,7 @@ all_platforms="${all_platforms} x86_64-darwin18-gcc"
all_platforms="${all_platforms} x86_64-darwin19-gcc"
all_platforms="${all_platforms} x86_64-darwin20-gcc"
all_platforms="${all_platforms} x86_64-darwin21-gcc"
+all_platforms="${all_platforms} x86_64-darwin22-gcc"
all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
all_platforms="${all_platforms} x86_64-linux-gcc"
all_platforms="${all_platforms} x86_64-linux-icc"
@@ -666,11 +672,18 @@ process_toolchain() {
check_add_cxxflags -Wno-psabi
fi
+ # Enforce C++11 compatibility.
+ check_add_cxxflags -Wc++14-extensions
+ check_add_cxxflags -Wc++17-extensions
+ check_add_cxxflags -Wc++20-extensions
+
# disable some warnings specific to libyuv.
check_cxxflags -Wno-missing-declarations \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-declarations"
check_cxxflags -Wno-missing-prototypes \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-missing-prototypes"
+ check_cxxflags -Wno-pass-failed \
+ && LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-pass-failed"
check_cxxflags -Wno-unused-parameter \
&& LIBYUV_CXXFLAGS="${LIBYUV_CXXFLAGS} -Wno-unused-parameter"
fi
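The three -Wc++NN-extensions flags make Clang diagnose any construct from a newer standard that it would otherwise quietly accept as an extension under -std=c++11, which is how this keeps the codebase C++11-clean. For instance (Clang's wording, roughly):

// Under -std=c++11 -Wc++14-extensions:
auto deduced() { return 1; }  // warning: 'auto' return without trailing
                              // return type is a C++14 extension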
diff --git a/libvpx/examples/svc_encodeframe.c b/libvpx/examples/svc_encodeframe.c
index 08bda0e5c..003096e70 100644
--- a/libvpx/examples/svc_encodeframe.c
+++ b/libvpx/examples/svc_encodeframe.c
@@ -552,11 +552,8 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
iter = NULL;
while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
switch (cx_pkt->kind) {
- case VPX_CODEC_PSNR_PKT: {
- }
- ++si->psnr_pkt_received;
- break;
- default: { break; }
+ case VPX_CODEC_PSNR_PKT: ++si->psnr_pkt_received; break;
+ default: break;
}
}
diff --git a/libvpx/examples/vp9_spatial_svc_encoder.c b/libvpx/examples/vp9_spatial_svc_encoder.c
index e85dbf8e7..d287e5831 100644
--- a/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -1146,7 +1146,9 @@ int main(int argc, const char **argv) {
cx_pkt->data.twopass_stats.sz);
break;
}
- default: { break; }
+ default: {
+ break;
+ }
}
#if CONFIG_VP9_DECODER && !SIMULCAST_MODE
diff --git a/libvpx/libs.doxy_template b/libvpx/libs.doxy_template
index 1eacc8fe2..1ee442af3 100644
--- a/libvpx/libs.doxy_template
+++ b/libvpx/libs.doxy_template
@@ -654,12 +654,6 @@ VERBATIM_HEADERS = YES
ALPHABETICAL_INDEX = NO
-# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
-# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
-# in which this list will be split (can be a number in the range [1..20])
-
-COLS_IN_ALPHA_INDEX = 5
-
# In case all classes in a project start with a common prefix, all
# classes will be put under the same header in the alphabetical index.
# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
@@ -1099,32 +1093,10 @@ ALLEXTERNALS = NO
EXTERNAL_GROUPS = YES
-# The PERL_PATH should be the absolute path and name of the perl script
-# interpreter (i.e. the result of `which perl').
-
-PERL_PATH = /usr/bin/perl
-
#---------------------------------------------------------------------------
# Configuration options related to the dot tool
#---------------------------------------------------------------------------
-# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
-# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base
-# or super classes. Setting the tag to NO turns the diagrams off. Note that
-# this option is superseded by the HAVE_DOT option below. This is only a
-# fallback. It is recommended to install and use dot, since it yields more
-# powerful graphs.
-
-CLASS_DIAGRAMS = YES
-
-# You can define message sequence charts within doxygen comments using the \msc
-# command. Doxygen will then run the mscgen tool (see http://www.mcternan.me.uk/mscgen/) to
-# produce the chart and insert it in the documentation. The MSCGEN_PATH tag allows you to
-# specify the directory where the mscgen tool resides. If left empty the tool is assumed to
-# be found in the default search path.
-
-MSCGEN_PATH =
-
# If set to YES, the inheritance and collaboration graphs will hide
# inheritance and usage relations if the target is undocumented
# or is not a class.
@@ -1138,10 +1110,14 @@ HIDE_UNDOC_RELATIONS = YES
HAVE_DOT = NO
-# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
-# will generate a graph for each documented class showing the direct and
-# indirect inheritance relations. Setting this tag to YES will force the
-# the CLASS_DIAGRAMS tag to NO.
+# If the CLASS_GRAPH tag is set to YES (or GRAPH) then doxygen will generate a
+# graph for each documented class showing the direct and indirect inheritance
+# relations. In case HAVE_DOT is set as well dot will be used to draw the graph,
+# otherwise the built-in generator will be used. If the CLASS_GRAPH tag is set
+# to TEXT the direct and indirect inheritance relations will be shown as texts /
+# links.
+# Possible values are: NO, YES, TEXT and GRAPH.
+# The default value is: YES.
CLASS_GRAPH = YES
diff --git a/libvpx/libs.mk b/libvpx/libs.mk
index 00e49a19d..1f7f03aa3 100644
--- a/libvpx/libs.mk
+++ b/libvpx/libs.mk
@@ -312,8 +312,8 @@ $(BUILD_PFX)libvpx_g.a: $(LIBVPX_OBJS)
# To determine SO_VERSION_{MAJOR,MINOR,PATCH}, calculate c,a,r with current
# SO_VERSION_* then follow the rules in the link to determine the new version
# (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
-SO_VERSION_MAJOR := 7
-SO_VERSION_MINOR := 1
+SO_VERSION_MAJOR := 8
+SO_VERSION_MINOR := 0
SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
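
The comment above encodes libtool's current/age/revision bookkeeping; as a quick worked example of the mapping for the bump below (the triple is an assumption chosen to reproduce 8.0.0, not taken from libtool):

    // Worked example of MAJOR = [c1-a1], MINOR = a1, PATCH = r1.
    constexpr int c1 = 8, a1 = 0, r1 = 0;  // assumed current/age/revision
    constexpr int kMajor = c1 - a1;        // 8 -> SO_VERSION_MAJOR
    constexpr int kMinor = a1;             // 0 -> SO_VERSION_MINOR
    constexpr int kPatch = r1;             // 0 -> SO_VERSION_PATCH
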
@@ -446,13 +446,13 @@ ifeq ($(VPX_ARCH_X86)$(VPX_ARCH_X86_64),yes)
# YASM
$(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
@echo " [CREATE] $@"
- @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \
+ @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \
| awk '{print $$2 " equ " $$3}' > $@
else
ADS2GAS=$(if $(filter yes,$(CONFIG_GCC)),| $(ASM_CONVERSION))
$(BUILD_PFX)vpx_config.asm: $(BUILD_PFX)vpx_config.h
@echo " [CREATE] $@"
- @LC_ALL=C egrep "#define [A-Z0-9_]+ [01]" $< \
+ @LC_ALL=C grep -E "#define [A-Z0-9_]+ [01]" $< \
| awk '{print $$2 " EQU " $$3}' $(ADS2GAS) > $@
@echo " END" $(ADS2GAS) >> $@
CLEAN-OBJS += $(BUILD_PFX)vpx_config.asm
@@ -536,7 +536,7 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
esac \
)
-testdata:: $(LIBVPX_TEST_DATA)
+testdata: $(LIBVPX_TEST_DATA)
$(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
[ -x "$$(which shasum)" ] && sha1sum=shasum;\
[ -x "$$(which sha1)" ] && sha1sum=sha1;\
@@ -709,15 +709,15 @@ INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(TEST_INTRA_PRED_SPEED_SRCS)
INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(RC_INTERFACE_TEST_SRCS)
define test_shard_template
-test:: test_shard.$(1)
-test-no-data-check:: test_shard_ndc.$(1)
+test: test_shard.$(1)
+test-no-data-check: test_shard_ndc.$(1)
test_shard.$(1) test_shard_ndc.$(1): $(LIBVPX_TEST_BIN)
@set -e; \
export GTEST_SHARD_INDEX=$(1); \
export GTEST_TOTAL_SHARDS=$(2); \
$(LIBVPX_TEST_BIN)
test_shard.$(1): testdata
-.PHONY: test_shard.$(1)
+.PHONY: test_shard.$(1) test_shard_ndc.$(1)
endef
NUM_SHARDS := 10
diff --git a/libvpx/md5_utils.c b/libvpx/md5_utils.c
index c4106525f..abd8d43c3 100644
--- a/libvpx/md5_utils.c
+++ b/libvpx/md5_utils.c
@@ -151,8 +151,8 @@ void MD5Final(md5byte digest[16], struct MD5Context *ctx) {
* reflect the addition of 16 longwords of new data. MD5Update blocks
* the data and converts bytes into longwords for this routine.
*/
-VPX_NO_UNSIGNED_OVERFLOW_CHECK void MD5Transform(UWORD32 buf[4],
- UWORD32 const in[16]) {
+VPX_NO_UNSIGNED_OVERFLOW_CHECK VPX_NO_UNSIGNED_SHIFT_CHECK void MD5Transform(
+ UWORD32 buf[4], UWORD32 const in[16]) {
UWORD32 a, b, c, d;
a = buf[0];
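
MD5's mixing steps deliberately wrap and shift 32-bit words, so the transform now opts out of the unsigned-shift sanitizer check alongside the existing overflow opt-out. A hedged sketch of how such a macro is commonly spelled (not the exact libvpx definition; the check name assumes clang's integer sanitizer):

    #if defined(__clang__)
    #define NO_UNSIGNED_SHIFT_CHECK \
      __attribute__((no_sanitize("unsigned-shift-base")))
    #else
    #define NO_UNSIGNED_SHIFT_CHECK
    #endif
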
diff --git a/libvpx/test/acm_random.h b/libvpx/test/acm_random.h
index 3458340a1..c7122b933 100644
--- a/libvpx/test/acm_random.h
+++ b/libvpx/test/acm_random.h
@@ -28,43 +28,43 @@ class ACMRandom {
explicit ACMRandom(int seed) : random_(seed) {}
void Reset(int seed) { random_.Reseed(seed); }
- uint16_t Rand16(void) {
+ uint16_t Rand16() {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
return (value >> 15) & 0xffff;
}
- int32_t Rand20Signed(void) {
+ int32_t Rand20Signed() {
// Use 20 bits: values between 524287 and -524288.
const uint32_t value = random_.Generate(1048576);
return static_cast<int32_t>(value) - 524288;
}
- int16_t Rand16Signed(void) {
+ int16_t Rand16Signed() {
// Use 16 bits: values between 32767 and -32768.
return static_cast<int16_t>(random_.Generate(65536));
}
- int16_t Rand13Signed(void) {
+ int16_t Rand13Signed() {
// Use 13 bits: values between 4095 and -4096.
const uint32_t value = random_.Generate(8192);
return static_cast<int16_t>(value) - 4096;
}
- int16_t Rand9Signed(void) {
+ int16_t Rand9Signed() {
// Use 9 bits: values between 255 (0x0FF) and -256 (0x100).
const uint32_t value = random_.Generate(512);
return static_cast<int16_t>(value) - 256;
}
- uint8_t Rand8(void) {
+ uint8_t Rand8() {
const uint32_t value =
random_.Generate(testing::internal::Random::kMaxRange);
// There's a bit more entropy in the upper bits of this implementation.
return (value >> 23) & 0xff;
}
- uint8_t Rand8Extremes(void) {
+ uint8_t Rand8Extremes() {
// Returns a random value near 0 or near 255, to better exercise
// saturation behavior.
const uint8_t r = Rand8();
@@ -82,7 +82,7 @@ class ACMRandom {
int operator()(int n) { return PseudoUniform(n); }
- static int DeterministicSeed(void) { return 0xbaba; }
+ static int DeterministicSeed() { return 0xbaba; }
private:
testing::internal::Random random_;
diff --git a/libvpx/test/android/Android.mk b/libvpx/test/android/Android.mk
index 87155fcb5..9a7533ebb 100644
--- a/libvpx/test/android/Android.mk
+++ b/libvpx/test/android/Android.mk
@@ -10,6 +10,9 @@
# The test app itself runs on the command line through adb shell
# The paths are really messed up as the libvpx make file
# expects to be made from a parent directory.
+
+# Ignore this file during non-NDK builds.
+ifdef NDK_ROOT
CUR_WD := $(call my-dir)
BINDINGS_DIR := $(CUR_WD)/../../..
LOCAL_PATH := $(CUR_WD)/../../..
@@ -61,3 +64,4 @@ LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
# some test files depend on *_rtcd.h, ensure they're generated first.
$(eval $(call rtcd_dep_template))
include $(BUILD_EXECUTABLE)
+endif # NDK_ROOT
diff --git a/libvpx/test/comp_avg_pred_test.cc b/libvpx/test/comp_avg_pred_test.cc
index 3977a2d0b..70aeab8d7 100644
--- a/libvpx/test/comp_avg_pred_test.cc
+++ b/libvpx/test/comp_avg_pred_test.cc
@@ -22,13 +22,14 @@ namespace {
using ::libvpx_test::ACMRandom;
using ::libvpx_test::Buffer;
-typedef void (*AvgPredFunc)(uint8_t *a, const uint8_t *b, int w, int h,
- const uint8_t *c, int c_stride);
-
-uint8_t avg_with_rounding(uint8_t a, uint8_t b) { return (a + b + 1) >> 1; }
+template <typename Pixel>
+Pixel avg_with_rounding(Pixel a, Pixel b) {
+ return (a + b + 1) >> 1;
+}
-void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
- int width, int height, Buffer<uint8_t> *avg) {
+template <typename Pixel>
+void reference_pred(const Buffer<Pixel> &pred, const Buffer<Pixel> &ref,
+ int width, int height, Buffer<Pixel> *avg) {
ASSERT_NE(avg->TopLeftPixel(), nullptr);
ASSERT_NE(pred.TopLeftPixel(), nullptr);
ASSERT_NE(ref.TopLeftPixel(), nullptr);
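
Templating the helper lets the same reference path serve 8-bit and 16-bit pixels; for reference, a standalone sketch of its round-to-nearest behavior (the +1 makes ties round up; values illustrative):

    #include <cstdint>
    #include <cstdio>

    template <typename Pixel>
    Pixel avg_with_rounding(Pixel a, Pixel b) {
      return (a + b + 1) >> 1;  // integer average, .5 rounds up
    }

    int main() {
      std::printf("%d\n", avg_with_rounding<uint8_t>(3, 4));        // 4, not 3
      std::printf("%d\n", avg_with_rounding<uint16_t>(1000, 1003)); // 1002
    }
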
@@ -36,12 +37,16 @@ void reference_pred(const Buffer<uint8_t> &pred, const Buffer<uint8_t> &ref,
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
avg->TopLeftPixel()[y * avg->stride() + x] =
- avg_with_rounding(pred.TopLeftPixel()[y * pred.stride() + x],
- ref.TopLeftPixel()[y * ref.stride() + x]);
+ avg_with_rounding<Pixel>(pred.TopLeftPixel()[y * pred.stride() + x],
+ ref.TopLeftPixel()[y * ref.stride() + x]);
}
}
}
+using AvgPredFunc = void (*)(uint8_t *a, const uint8_t *b, int w, int h,
+ const uint8_t *c, int c_stride);
+
+template <int bitdepth, typename Pixel>
class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
public:
virtual void SetUp() {
@@ -49,15 +54,19 @@ class AvgPredTest : public ::testing::TestWithParam<AvgPredFunc> {
rnd_.Reset(ACMRandom::DeterministicSeed());
}
+ void TestSizeCombinations();
+ void TestCompareReferenceRandom();
+ void TestSpeed();
+
protected:
AvgPredFunc avg_pred_func_;
ACMRandom rnd_;
};
-TEST_P(AvgPredTest, SizeCombinations) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSizeCombinations() {
// This is called as part of the sub pixel variance. As such it must be one of
// the variance block sizes.
-
for (int width_pow = 2; width_pow <= 6; ++width_pow) {
for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
++height_pow) {
@@ -70,23 +79,30 @@ TEST_P(AvgPredTest, SizeCombinations) {
const int width = 1 << width_pow;
const int height = 1 << height_pow;
// Only the reference buffer may have a stride not equal to width.
- Buffer<uint8_t> ref =
- Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+ Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
ASSERT_TRUE(ref.Init());
- Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(pred.Init());
- Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg_ref.Init());
- Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg_chk.Init());
+ const int bitdepth_mask = (1 << bitdepth) - 1;
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
- ref.Set(&rnd_, &ACMRandom::Rand8);
- pred.Set(&rnd_, &ACMRandom::Rand8);
-
- reference_pred(pred, ref, width, height, &avg_ref);
- ASM_REGISTER_STATE_CHECK(
- avg_pred_func_(avg_chk.TopLeftPixel(), pred.TopLeftPixel(), width,
- height, ref.TopLeftPixel(), ref.stride()));
+ reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+ ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+ (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+ width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
if (HasFailure()) {
@@ -99,26 +115,36 @@ TEST_P(AvgPredTest, SizeCombinations) {
}
}
-TEST_P(AvgPredTest, CompareReferenceRandom) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestCompareReferenceRandom() {
const int width = 64;
const int height = 32;
- Buffer<uint8_t> ref = Buffer<uint8_t>(width, height, 8);
+ Buffer<Pixel> ref = Buffer<Pixel>(width, height, 8);
ASSERT_TRUE(ref.Init());
- Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(pred.Init());
- Buffer<uint8_t> avg_ref = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg_ref = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg_ref.Init());
- Buffer<uint8_t> avg_chk = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg_chk = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg_chk.Init());
for (int i = 0; i < 500; ++i) {
- ref.Set(&rnd_, &ACMRandom::Rand8);
- pred.Set(&rnd_, &ACMRandom::Rand8);
+ const int bitdepth_mask = (1 << bitdepth) - 1;
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
- reference_pred(pred, ref, width, height, &avg_ref);
- ASM_REGISTER_STATE_CHECK(avg_pred_func_(avg_chk.TopLeftPixel(),
- pred.TopLeftPixel(), width, height,
- ref.TopLeftPixel(), ref.stride()));
+ reference_pred<Pixel>(pred, ref, width, height, &avg_ref);
+ ASM_REGISTER_STATE_CHECK(avg_pred_func_(
+ (uint8_t *)avg_chk.TopLeftPixel(), (uint8_t *)pred.TopLeftPixel(),
+ width, height, (uint8_t *)ref.TopLeftPixel(), ref.stride()));
EXPECT_TRUE(avg_chk.CheckValues(avg_ref));
if (HasFailure()) {
printf("Width: %d Height: %d\n", width, height);
@@ -128,7 +154,8 @@ TEST_P(AvgPredTest, CompareReferenceRandom) {
}
}
-TEST_P(AvgPredTest, DISABLED_Speed) {
+template <int bitdepth, typename Pixel>
+void AvgPredTest<bitdepth, Pixel>::TestSpeed() {
for (int width_pow = 2; width_pow <= 6; ++width_pow) {
for (int height_pow = width_pow - 1; height_pow <= width_pow + 1;
++height_pow) {
@@ -138,22 +165,30 @@ TEST_P(AvgPredTest, DISABLED_Speed) {
for (int ref_padding = 0; ref_padding < 2; ref_padding++) {
const int width = 1 << width_pow;
const int height = 1 << height_pow;
- Buffer<uint8_t> ref =
- Buffer<uint8_t>(width, height, ref_padding ? 8 : 0);
+ Buffer<Pixel> ref = Buffer<Pixel>(width, height, ref_padding ? 8 : 0);
ASSERT_TRUE(ref.Init());
- Buffer<uint8_t> pred = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> pred = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(pred.Init());
- Buffer<uint8_t> avg = Buffer<uint8_t>(width, height, 0, 16);
+ Buffer<Pixel> avg = Buffer<Pixel>(width, height, 0, 16);
ASSERT_TRUE(avg.Init());
-
- ref.Set(&rnd_, &ACMRandom::Rand8);
- pred.Set(&rnd_, &ACMRandom::Rand8);
+ const int bitdepth_mask = (1 << bitdepth) - 1;
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ ref.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
+ for (int h = 0; h < height; ++h) {
+ for (int w = 0; w < width; ++w) {
+ pred.TopLeftPixel()[w + h * width] = rnd_.Rand16() & bitdepth_mask;
+ }
+ }
vpx_usec_timer timer;
vpx_usec_timer_start(&timer);
- for (int i = 0; i < 10000000 / (width * height); ++i) {
- avg_pred_func_(avg.TopLeftPixel(), pred.TopLeftPixel(), width, height,
- ref.TopLeftPixel(), ref.stride());
+ for (int i = 0; i < 100000000 / (width * height); ++i) {
+ avg_pred_func_((uint8_t *)avg.TopLeftPixel(),
+ (uint8_t *)pred.TopLeftPixel(), width, height,
+ (uint8_t *)ref.TopLeftPixel(), ref.stride());
}
vpx_usec_timer_mark(&timer);
@@ -166,26 +201,64 @@ TEST_P(AvgPredTest, DISABLED_Speed) {
}
}
-INSTANTIATE_TEST_SUITE_P(C, AvgPredTest,
+using AvgPredTestLBD = AvgPredTest<8, uint8_t>;
+
+TEST_P(AvgPredTestLBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestLBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestLBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(C, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_c));
#if HAVE_SSE2
-INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(SSE2, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_sse2));
#endif // HAVE_SSE2
#if HAVE_NEON
-INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(NEON, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_neon));
#endif // HAVE_NEON
#if HAVE_VSX
-INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_vsx));
#endif // HAVE_VSX
#if HAVE_LSX
-INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest,
+INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTestLBD,
::testing::Values(&vpx_comp_avg_pred_lsx));
#endif // HAVE_LSX
+
+#if CONFIG_VP9_HIGHBITDEPTH
+using HighbdAvgPredFunc = void (*)(uint16_t *a, const uint16_t *b, int w, int h,
+ const uint16_t *c, int c_stride);
+
+template <HighbdAvgPredFunc fn>
+void highbd_wrapper(uint8_t *a, const uint8_t *b, int w, int h,
+ const uint8_t *c, int c_stride) {
+ fn((uint16_t *)a, (const uint16_t *)b, w, h, (const uint16_t *)c, c_stride);
+}
+
+using AvgPredTestHBD = AvgPredTest<12, uint16_t>;
+
+TEST_P(AvgPredTestHBD, SizeCombinations) { TestSizeCombinations(); }
+
+TEST_P(AvgPredTestHBD, CompareReferenceRandom) { TestCompareReferenceRandom(); }
+
+TEST_P(AvgPredTestHBD, DISABLED_Speed) { TestSpeed(); }
+
+INSTANTIATE_TEST_SUITE_P(
+ C, AvgPredTestHBD,
+ ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_c>));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_SUITE_P(
+ SSE2, AvgPredTestHBD,
+ ::testing::Values(&highbd_wrapper<vpx_highbd_comp_avg_pred_sse2>));
+#endif // HAVE_SSE2
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
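
The Rand8() fills above were replaced because the high-bit-depth instantiations need samples wider than 8 bits; masking a 16-bit draw keeps every pixel inside [0, 2^bitdepth - 1]. A minimal sketch of the masking (bit depth chosen for illustration):

    #include <cstdint>
    #include <cstdio>

    int main() {
      const int bitdepth = 12;                        // illustrative
      const int bitdepth_mask = (1 << bitdepth) - 1;  // 0xfff
      const uint16_t draw = 0xffff;                   // worst-case 16-bit draw
      std::printf("%d\n", draw & bitdepth_mask);      // 4095, the 12-bit maximum
    }
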
diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc
index 06837d809..d4ef7ae13 100644
--- a/libvpx/test/dct16x16_test.cc
+++ b/libvpx/test/dct16x16_test.cc
@@ -789,13 +789,23 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
#endif // CONFIG_VP9_HIGHBITDEPTH
-#if HAVE_NEON && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_SUITE_P(
NEON, Trans16x16DCT,
::testing::Values(make_tuple(&vpx_fdct16x16_neon,
&vpx_idct16x16_256_add_neon, 0, VPX_BITS_8)));
#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+#if HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+ NEON, Trans16x16DCT,
+ ::testing::Values(
+ make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_10, 0, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct16x16_neon, &idct16x16_12, 0, VPX_BITS_12),
+ make_tuple(&vpx_fdct16x16_neon, &vpx_idct16x16_256_add_c, 0,
+ VPX_BITS_8)));
+#endif // HAVE_NEON && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_SUITE_P(
SSE2, Trans16x16DCT,
diff --git a/libvpx/test/dct_partial_test.cc b/libvpx/test/dct_partial_test.cc
index 8d0e3a912..e57fa0f48 100644
--- a/libvpx/test/dct_partial_test.cc
+++ b/libvpx/test/dct_partial_test.cc
@@ -145,11 +145,17 @@ INSTANTIATE_TEST_SUITE_P(
#if CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_SUITE_P(
NEON, PartialFdctTest,
- ::testing::Values(make_tuple(&vpx_fdct32x32_1_neon, 32, VPX_BITS_8),
- make_tuple(&vpx_fdct16x16_1_neon, 16, VPX_BITS_8),
+ ::testing::Values(make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_12),
+ make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct32x32_1_neon, 32, VPX_BITS_8),
+ make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_12),
+ make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_10),
+ make_tuple(&vpx_highbd_fdct16x16_1_neon, 16, VPX_BITS_8),
make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_12),
make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_10),
make_tuple(&vpx_fdct8x8_1_neon, 8, VPX_BITS_8),
+ make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_12),
+ make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_10),
make_tuple(&vpx_fdct4x4_1_neon, 4, VPX_BITS_8)));
#else
INSTANTIATE_TEST_SUITE_P(
diff --git a/libvpx/test/dct_test.cc b/libvpx/test/dct_test.cc
index 2182f87e5..0304029bd 100644
--- a/libvpx/test/dct_test.cc
+++ b/libvpx/test/dct_test.cc
@@ -539,6 +539,18 @@ INSTANTIATE_TEST_SUITE_P(AVX2, TransDCT,
#endif // HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_neon_func_info[] = {
+ { &fdct_wrapper<vpx_highbd_fdct4x4_neon>,
+ &highbd_idct_wrapper<vpx_highbd_idct4x4_16_add_neon>, 4, 2 },
+ { &fdct_wrapper<vpx_highbd_fdct8x8_neon>,
+ &highbd_idct_wrapper<vpx_highbd_idct8x8_64_add_neon>, 8, 2 },
+ { &fdct_wrapper<vpx_highbd_fdct16x16_neon>,
+ &highbd_idct_wrapper<vpx_highbd_idct16x16_256_add_neon>, 16, 2 },
+ /* { &fdct_wrapper<vpx_highbd_fdct32x32_neon>,
+ &highbd_idct_wrapper<vpx_highbd_idct32x32_1024_add_neon>, 32, 2 },*/
+};
+#else
static const FuncInfo dct_neon_func_info[4] = {
{ &fdct_wrapper<vpx_fdct4x4_neon>, &idct_wrapper<vpx_idct4x4_16_add_neon>, 4,
1 },
@@ -549,12 +561,15 @@ static const FuncInfo dct_neon_func_info[4] = {
{ &fdct_wrapper<vpx_fdct32x32_neon>,
&idct_wrapper<vpx_idct32x32_1024_add_neon>, 32, 1 }
};
+#endif // CONFIG_VP9_HIGHBITDEPTH
INSTANTIATE_TEST_SUITE_P(
NEON, TransDCT,
- ::testing::Combine(::testing::Range(0, 4),
- ::testing::Values(dct_neon_func_info),
- ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+ ::testing::Combine(
+ ::testing::Range(0, static_cast<int>(sizeof(dct_neon_func_info) /
+ sizeof(dct_neon_func_info[0]))),
+ ::testing::Values(dct_neon_func_info), ::testing::Values(0),
+ ::testing::Values(VPX_BITS_8, VPX_BITS_10, VPX_BITS_12)));
#endif // HAVE_NEON
#if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
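
Deriving the Range() bound from the table itself keeps the parameter count in step with whichever FuncInfo array the preprocessor selects. The idiom in isolation (stand-in values):

    // kTableSize tracks insertions and #if'd-out entries automatically,
    // unlike a hard-coded Range(0, 4).
    static const int kTable[] = { 10, 20, 30 };
    static const int kTableSize =
        static_cast<int>(sizeof(kTable) / sizeof(kTable[0]));  // 3
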
@@ -652,6 +667,8 @@ static const FuncInfo ht_neon_func_info[] = {
#if CONFIG_VP9_HIGHBITDEPTH
{ &vp9_highbd_fht4x4_c, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>, 4,
2 },
+ { &vp9_highbd_fht4x4_neon, &highbd_iht_wrapper<vp9_highbd_iht4x4_16_add_neon>,
+ 4, 2 },
{ &vp9_highbd_fht8x8_c, &highbd_iht_wrapper<vp9_highbd_iht8x8_64_add_neon>, 8,
2 },
{ &vp9_highbd_fht16x16_c,
diff --git a/libvpx/test/encode_api_test.cc b/libvpx/test/encode_api_test.cc
index 6f61c7750..ecdf92834 100644
--- a/libvpx/test/encode_api_test.cc
+++ b/libvpx/test/encode_api_test.cc
@@ -233,8 +233,8 @@ TEST(EncodeAPI, SetRoi) {
roi.roi_map = roi_map;
// VP8 only. This value isn't range checked.
roi.static_threshold[1] = 1000;
- roi.static_threshold[2] = INT_MIN;
- roi.static_threshold[3] = INT_MAX;
+ roi.static_threshold[2] = UINT_MAX / 2 + 1;
+ roi.static_threshold[3] = UINT_MAX;
for (const auto delta : { -63, -1, 0, 1, 63 }) {
for (int i = 0; i < 8; ++i) {
@@ -336,7 +336,7 @@ TEST(EncodeAPI, ConfigChangeThreadCount) {
for (const auto *iface : kCodecIfaces) {
SCOPED_TRACE(vpx_codec_iface_name(iface));
for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) {
- vpx_codec_enc_cfg_t cfg;
+ vpx_codec_enc_cfg_t cfg = {};
struct Encoder {
~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
vpx_codec_ctx_t ctx = {};
diff --git a/libvpx/test/encode_test_driver.cc b/libvpx/test/encode_test_driver.cc
index 1ce39eaef..d3feeee34 100644
--- a/libvpx/test/encode_test_driver.cc
+++ b/libvpx/test/encode_test_driver.cc
@@ -52,7 +52,8 @@ void Encoder::InitEncoder(VideoSource *video) {
}
}
-void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
+void Encoder::EncodeFrame(VideoSource *video,
+ const vpx_enc_frame_flags_t frame_flags) {
if (video->img()) {
EncodeFrameInternal(*video, frame_flags);
} else {
@@ -70,7 +71,7 @@ void Encoder::EncodeFrame(VideoSource *video, const unsigned long frame_flags) {
}
void Encoder::EncodeFrameInternal(const VideoSource &video,
- const unsigned long frame_flags) {
+ const vpx_enc_frame_flags_t frame_flags) {
vpx_codec_err_t res;
const vpx_image_t *img = video.img();
@@ -169,7 +170,7 @@ void EncoderTest::RunLoop(VideoSource *video) {
ASSERT_TRUE(passes_ == 1 || passes_ == 2);
for (unsigned int pass = 0; pass < passes_; pass++) {
- last_pts_ = 0;
+ vpx_codec_pts_t last_pts = 0;
if (passes_ == 1) {
cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -225,8 +226,8 @@ void EncoderTest::RunLoop(VideoSource *video) {
has_dxdata = true;
}
- ASSERT_GE(pkt->data.frame.pts, last_pts_);
- last_pts_ = pkt->data.frame.pts;
+ ASSERT_GE(pkt->data.frame.pts, last_pts);
+ last_pts = pkt->data.frame.pts;
FramePktHook(pkt);
break;
diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h
index 7085945f6..b57df8529 100644
--- a/libvpx/test/encode_test_driver.h
+++ b/libvpx/test/encode_test_driver.h
@@ -103,7 +103,7 @@ class Encoder {
}
// This is a thin wrapper around vpx_codec_encode(), so refer to
// vpx_encoder.h for its semantics.
- void EncodeFrame(VideoSource *video, const unsigned long frame_flags);
+ void EncodeFrame(VideoSource *video, vpx_enc_frame_flags_t frame_flags);
// Convenience wrapper for EncodeFrame()
void EncodeFrame(VideoSource *video) { EncodeFrame(video, 0); }
@@ -184,7 +184,7 @@ class Encoder {
// Encode an image
void EncodeFrameInternal(const VideoSource &video,
- const unsigned long frame_flags);
+ vpx_enc_frame_flags_t frame_flags);
// Flush the encoder on EOS
void Flush();
@@ -206,8 +206,7 @@ class Encoder {
class EncoderTest {
protected:
explicit EncoderTest(const CodecFactory *codec)
- : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
- last_pts_(0) {
+ : codec_(codec), abort_(false), init_flags_(0), frame_flags_(0) {
// Default to 1 thread.
cfg_.g_threads = 1;
}
@@ -290,8 +289,7 @@ class EncoderTest {
unsigned long deadline_;
TwopassStatsStore stats_;
unsigned long init_flags_;
- unsigned long frame_flags_;
- vpx_codec_pts_t last_pts_;
+ vpx_enc_frame_flags_t frame_flags_;
};
} // namespace libvpx_test
diff --git a/libvpx/test/error_resilience_test.cc b/libvpx/test/error_resilience_test.cc
index 45a327ec2..45138f14b 100644
--- a/libvpx/test/error_resilience_test.cc
+++ b/libvpx/test/error_resilience_test.cc
@@ -496,7 +496,7 @@ class ErrorResilienceTestLargeCodecControls
++tot_frame_number_;
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
duration_ = (last_pts_ + 1) * timebase_;
if (cfg_.ts_number_layers > 1) {
for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
diff --git a/libvpx/test/frame_size_tests.cc b/libvpx/test/frame_size_tests.cc
index d85c193e0..8a0eb71ba 100644
--- a/libvpx/test/frame_size_tests.cc
+++ b/libvpx/test/frame_size_tests.cc
@@ -111,7 +111,7 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
ASSERT_TRUE(passes_ == 1 || passes_ == 2);
for (unsigned int pass = 0; pass < passes_; pass++) {
- last_pts_ = 0;
+ vpx_codec_pts_t last_pts = 0;
if (passes_ == 1) {
cfg_.g_pass = VPX_RC_ONE_PASS;
@@ -144,8 +144,8 @@ class VP9FrameSizeTestsLarge : public ::libvpx_test::EncoderTest,
again = true;
switch (pkt->kind) {
case VPX_CODEC_CX_FRAME_PKT:
- ASSERT_GE(pkt->data.frame.pts, last_pts_);
- last_pts_ = pkt->data.frame.pts;
+ ASSERT_GE(pkt->data.frame.pts, last_pts);
+ last_pts = pkt->data.frame.pts;
FramePktHook(pkt);
break;
diff --git a/libvpx/test/hadamard_test.cc b/libvpx/test/hadamard_test.cc
index 10b1e79c1..f904e814a 100644
--- a/libvpx/test/hadamard_test.cc
+++ b/libvpx/test/hadamard_test.cc
@@ -264,7 +264,8 @@ INSTANTIATE_TEST_SUITE_P(
INSTANTIATE_TEST_SUITE_P(
NEON, HadamardLowbdTest,
::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_neon, 8),
- HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16)));
+ HadamardFuncWithSize(&vpx_hadamard_16x16_neon, 16),
+ HadamardFuncWithSize(&vpx_hadamard_32x32_neon, 32)));
#endif // HAVE_NEON
// TODO(jingning): Remove highbitdepth flag when the SIMD functions are
diff --git a/libvpx/test/md5_helper.h b/libvpx/test/md5_helper.h
index dc28dc628..9095d96a8 100644
--- a/libvpx/test/md5_helper.h
+++ b/libvpx/test/md5_helper.h
@@ -47,7 +47,7 @@ class MD5 {
MD5Update(&md5_, data, static_cast<uint32_t>(size));
}
- const char *Get(void) {
+ const char *Get() {
static const char hex[16] = {
'0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', 'a', 'b', 'c', 'd', 'e', 'f',
diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc
index 775f7f36a..27d5ffa90 100644
--- a/libvpx/test/pp_filter_test.cc
+++ b/libvpx/test/pp_filter_test.cc
@@ -7,7 +7,11 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+
#include <limits.h>
+
+#include <memory>
+
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
@@ -458,14 +462,13 @@ TEST_P(VpxMbPostProcDownTest, CheckLowFilterOutput) {
SetRows(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride());
- unsigned char *expected_output = new unsigned char[rows_ * cols_];
+ std::unique_ptr<unsigned char[]> expected_output(
+ new unsigned char[rows_ * cols_]);
ASSERT_NE(expected_output, nullptr);
- SetRows(expected_output, rows_, cols_, cols_);
+ SetRows(expected_output.get(), rows_, cols_, cols_);
RunFilterLevel(src_c_.TopLeftPixel(), rows_, cols_, src_c_.stride(), q2mbl(0),
- expected_output);
-
- delete[] expected_output;
+ expected_output.get());
}
TEST_P(VpxMbPostProcDownTest, CheckCvsAssembly) {
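
Switching to the array form of std::unique_ptr drops the manual delete[] while keeping identical element access. The pattern in miniature:

    #include <memory>

    int main() {
      std::unique_ptr<unsigned char[]> buf(new unsigned char[16 * 16]);
      buf[0] = 255;                  // element access, as with a raw array
      return buf[0] == 255 ? 0 : 1;
    }  // delete[] runs automatically here, even on early exits
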
diff --git a/libvpx/test/resize_test.cc b/libvpx/test/resize_test.cc
index c57170ff9..715bb9d70 100644
--- a/libvpx/test/resize_test.cc
+++ b/libvpx/test/resize_test.cc
@@ -95,10 +95,11 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
unsigned int initial_h, unsigned int *w,
unsigned int *h, bool flag_codec,
bool smaller_width_larger_size_) {
+ *w = initial_w;
+ *h = initial_h;
+
if (smaller_width_larger_size_) {
if (frame < 30) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 100) {
@@ -109,8 +110,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 10) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 20) {
@@ -124,8 +123,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 40) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 50) {
@@ -139,8 +136,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 70) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 80) {
@@ -159,8 +154,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 110) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 120) {
@@ -179,8 +172,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 150) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 160) {
@@ -199,8 +190,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 190) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 200) {
@@ -219,8 +208,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 230) {
- *w = initial_w;
- *h = initial_h;
return;
}
if (frame < 240) {
@@ -234,8 +221,6 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
return;
}
if (frame < 260) {
- *w = initial_w;
- *h = initial_h;
return;
}
// Go down very low.
@@ -248,13 +233,9 @@ void ScaleForFrameNumber(unsigned int frame, unsigned int initial_w,
// Cases that only work for VP9.
// For VP9: Swap width and height of original.
if (frame < 320) {
- *w = initial_h;
- *h = initial_w;
return;
}
}
- *w = initial_w;
- *h = initial_h;
}
class ResizingVideoSource : public ::libvpx_test::DummyVideoSource {
@@ -578,6 +559,8 @@ TEST_P(ResizeRealtimeTest, TestExternalResizeWorks) {
}
}
+// TODO(https://crbug.com/webm/1642): This causes a segfault in
+// init_encode_frame_mb_context().
TEST_P(ResizeRealtimeTest, DISABLED_TestExternalResizeSmallerWidthBiggerSize) {
ResizingVideoSource video;
video.flag_codec_ = true;
@@ -794,8 +777,7 @@ TEST_P(ResizeCspTest, TestResizeCspWorks) {
}
VP8_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES);
-VP9_INSTANTIATE_TEST_SUITE(ResizeTest,
- ::testing::Values(::libvpx_test::kRealTime));
+VP9_INSTANTIATE_TEST_SUITE(ResizeTest, ONE_PASS_TEST_MODES);
VP9_INSTANTIATE_TEST_SUITE(ResizeInternalTest,
::testing::Values(::libvpx_test::kOnePassBest));
VP9_INSTANTIATE_TEST_SUITE(ResizeRealtimeTest,
diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc
index 2506f1adb..0896c77f1 100644
--- a/libvpx/test/sad_test.cc
+++ b/libvpx/test/sad_test.cc
@@ -311,13 +311,13 @@ class SADTest : public AbstractBench, public SADTestBase<SadMxNParam> {
ASSERT_EQ(reference_sad, exp_sad);
}
- void Run() {
+ void Run() override {
params_.func(source_data_, source_stride_, reference_data_,
reference_stride_);
}
};
-class SADavgTest : public SADTestBase<SadMxNAvgParam> {
+class SADavgTest : public AbstractBench, public SADTestBase<SadMxNAvgParam> {
public:
SADavgTest() : SADTestBase(GetParam()) {}
@@ -338,6 +338,11 @@ class SADavgTest : public SADTestBase<SadMxNAvgParam> {
ASSERT_EQ(reference_sad, exp_sad);
}
+
+ void Run() override {
+ params_.func(source_data_, source_stride_, reference_data_,
+ reference_stride_, second_pred_);
+ }
};
TEST_P(SADTest, MaxRef) {
@@ -437,6 +442,19 @@ TEST_P(SADavgTest, ShortSrc) {
source_stride_ = tmp_stride;
}
+TEST_P(SADavgTest, DISABLED_Speed) {
+ const int kCountSpeedTestBlock = 50000000 / (params_.width * params_.height);
+ FillRandom(source_data_, source_stride_);
+ FillRandom(reference_data_, reference_stride_);
+ FillRandom(second_pred_, params_.width);
+
+ RunNTimes(kCountSpeedTestBlock);
+
+ char title[16];
+ snprintf(title, sizeof(title), "%dx%d", params_.width, params_.height);
+ PrintMedian(title);
+}
+
TEST_P(SADx4Test, MaxRef) {
FillConstant(source_data_, source_stride_, 0);
FillConstant(GetReference(0), reference_stride_, mask_);
@@ -517,14 +535,12 @@ TEST_P(SADx4Test, DISABLED_Speed) {
uint32_t reference_sad[4];
DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[4]);
vpx_usec_timer timer;
-
- memset(reference_sad, 0, sizeof(reference_sad));
- SADs(exp_sad);
+ for (int block = 0; block < 4; ++block) {
+ reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block));
+ }
vpx_usec_timer_start(&timer);
for (int i = 0; i < kCountSpeedTestBlock; ++i) {
- for (int block = 0; block < 4; ++block) {
- reference_sad[block] = ReferenceSAD(GetBlockRefOffset(block));
- }
+ SADs(exp_sad);
}
vpx_usec_timer_mark(&timer);
for (int block = 0; block < 4; ++block) {
@@ -729,6 +745,45 @@ const SadMxNParam neon_tests[] = {
SadMxNParam(8, 4, &vpx_sad8x4_neon),
SadMxNParam(4, 8, &vpx_sad4x8_neon),
SadMxNParam(4, 4, &vpx_sad4x4_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 8),
+ SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 8),
+ SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 8),
+ SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 8),
+ SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 8),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 8),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 8),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 8),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 8),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 8),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 8),
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 8),
+ SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 10),
+ SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 10),
+ SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 10),
+ SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 10),
+ SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 10),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 10),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 10),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 10),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 10),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 10),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 10),
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 10),
+ SadMxNParam(4, 4, &vpx_highbd_sad4x4_neon, 12),
+ SadMxNParam(4, 8, &vpx_highbd_sad4x8_neon, 12),
+ SadMxNParam(8, 4, &vpx_highbd_sad8x4_neon, 12),
+ SadMxNParam(8, 8, &vpx_highbd_sad8x8_neon, 12),
+ SadMxNParam(8, 16, &vpx_highbd_sad8x16_neon, 12),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_neon, 12),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_neon, 12),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_neon, 12),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_neon, 12),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_neon, 12),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_neon, 12),
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_neon, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
};
INSTANTIATE_TEST_SUITE_P(NEON, SADTest, ::testing::ValuesIn(neon_tests));
@@ -746,6 +801,47 @@ const SadMxNAvgParam avg_neon_tests[] = {
SadMxNAvgParam(8, 4, &vpx_sad8x4_avg_neon),
SadMxNAvgParam(4, 8, &vpx_sad4x8_avg_neon),
SadMxNAvgParam(4, 4, &vpx_sad4x4_avg_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 8),
+ SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 8),
+ SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 8),
+ SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 8),
+ SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 8),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 8),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 8),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 8),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 8),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 8),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 8),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 8),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 8),
+ SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 10),
+ SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 10),
+ SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 10),
+ SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 10),
+ SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 10),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 10),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 10),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 10),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 10),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 10),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 10),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 10),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 10),
+ SadMxNAvgParam(4, 4, &vpx_highbd_sad4x4_avg_neon, 12),
+ SadMxNAvgParam(4, 8, &vpx_highbd_sad4x8_avg_neon, 12),
+ SadMxNAvgParam(8, 4, &vpx_highbd_sad8x4_avg_neon, 12),
+ SadMxNAvgParam(8, 8, &vpx_highbd_sad8x8_avg_neon, 12),
+ SadMxNAvgParam(8, 16, &vpx_highbd_sad8x16_avg_neon, 12),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_neon, 12),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_neon, 12),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_neon, 12),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_neon, 12),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_neon, 12),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_neon, 12),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_neon, 12),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_neon, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(NEON, SADavgTest, ::testing::ValuesIn(avg_neon_tests));
@@ -763,6 +859,44 @@ const SadMxNx4Param x4d_neon_tests[] = {
SadMxNx4Param(8, 4, &vpx_sad8x4x4d_neon),
SadMxNx4Param(4, 8, &vpx_sad4x8x4d_neon),
SadMxNx4Param(4, 4, &vpx_sad4x4x4d_neon),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 8),
+ SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 8),
+ SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 8),
+ SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 8),
+ SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 8),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 8),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 8),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 8),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 8),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 8),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 8),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 8),
+ SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 10),
+ SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 10),
+ SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 10),
+ SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 10),
+ SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 10),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 10),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 10),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 10),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 10),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 10),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 10),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 10),
+ SadMxNx4Param(4, 4, &vpx_highbd_sad4x4x4d_neon, 12),
+ SadMxNx4Param(4, 8, &vpx_highbd_sad4x8x4d_neon, 12),
+ SadMxNx4Param(8, 4, &vpx_highbd_sad8x4x4d_neon, 12),
+ SadMxNx4Param(8, 8, &vpx_highbd_sad8x8x4d_neon, 12),
+ SadMxNx4Param(8, 16, &vpx_highbd_sad8x16x4d_neon, 12),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_neon, 12),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_neon, 12),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_neon, 12),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_neon, 12),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_neon, 12),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_neon, 12),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_neon, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(NEON, SADx4Test, ::testing::ValuesIn(x4d_neon_tests));
#endif // HAVE_NEON
@@ -948,6 +1082,34 @@ const SadMxNParam avx2_tests[] = {
SadMxNParam(32, 64, &vpx_sad32x64_avx2),
SadMxNParam(32, 32, &vpx_sad32x32_avx2),
SadMxNParam(32, 16, &vpx_sad32x16_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 8),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 8),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 8),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 8),
+ SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 8),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 8),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 8),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 8),
+
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 10),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 10),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 10),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 10),
+ SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 10),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 10),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 10),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 10),
+
+ SadMxNParam(64, 64, &vpx_highbd_sad64x64_avx2, 12),
+ SadMxNParam(64, 32, &vpx_highbd_sad64x32_avx2, 12),
+ SadMxNParam(32, 64, &vpx_highbd_sad32x64_avx2, 12),
+ SadMxNParam(32, 32, &vpx_highbd_sad32x32_avx2, 12),
+ SadMxNParam(32, 16, &vpx_highbd_sad32x16_avx2, 12),
+ SadMxNParam(16, 32, &vpx_highbd_sad16x32_avx2, 12),
+ SadMxNParam(16, 16, &vpx_highbd_sad16x16_avx2, 12),
+ SadMxNParam(16, 8, &vpx_highbd_sad16x8_avx2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADTest, ::testing::ValuesIn(avx2_tests));
@@ -957,12 +1119,64 @@ const SadMxNAvgParam avg_avx2_tests[] = {
SadMxNAvgParam(32, 64, &vpx_sad32x64_avg_avx2),
SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_avx2),
SadMxNAvgParam(32, 16, &vpx_sad32x16_avg_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 8),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 8),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 8),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 8),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 8),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 8),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 8),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 8),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 10),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 10),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 10),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 10),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 10),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 10),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 10),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 10),
+ SadMxNAvgParam(64, 64, &vpx_highbd_sad64x64_avg_avx2, 12),
+ SadMxNAvgParam(64, 32, &vpx_highbd_sad64x32_avg_avx2, 12),
+ SadMxNAvgParam(32, 64, &vpx_highbd_sad32x64_avg_avx2, 12),
+ SadMxNAvgParam(32, 32, &vpx_highbd_sad32x32_avg_avx2, 12),
+ SadMxNAvgParam(32, 16, &vpx_highbd_sad32x16_avg_avx2, 12),
+ SadMxNAvgParam(16, 32, &vpx_highbd_sad16x32_avg_avx2, 12),
+ SadMxNAvgParam(16, 16, &vpx_highbd_sad16x16_avg_avx2, 12),
+ SadMxNAvgParam(16, 8, &vpx_highbd_sad16x8_avg_avx2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADavgTest, ::testing::ValuesIn(avg_avx2_tests));
const SadMxNx4Param x4d_avx2_tests[] = {
SadMxNx4Param(64, 64, &vpx_sad64x64x4d_avx2),
SadMxNx4Param(32, 32, &vpx_sad32x32x4d_avx2),
+#if CONFIG_VP9_HIGHBITDEPTH
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 8),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 8),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 8),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 8),
+ SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 8),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 8),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 8),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 8),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 10),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 10),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 10),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 10),
+ SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 10),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 10),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 10),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 10),
+ SadMxNx4Param(64, 64, &vpx_highbd_sad64x64x4d_avx2, 12),
+ SadMxNx4Param(64, 32, &vpx_highbd_sad64x32x4d_avx2, 12),
+ SadMxNx4Param(32, 64, &vpx_highbd_sad32x64x4d_avx2, 12),
+ SadMxNx4Param(32, 32, &vpx_highbd_sad32x32x4d_avx2, 12),
+ SadMxNx4Param(32, 16, &vpx_highbd_sad32x16x4d_avx2, 12),
+ SadMxNx4Param(16, 32, &vpx_highbd_sad16x32x4d_avx2, 12),
+ SadMxNx4Param(16, 16, &vpx_highbd_sad16x16x4d_avx2, 12),
+ SadMxNx4Param(16, 8, &vpx_highbd_sad16x8x4d_avx2, 12),
+#endif // CONFIG_VP9_HIGHBITDEPTH
};
INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
diff --git a/libvpx/test/svc_datarate_test.cc b/libvpx/test/svc_datarate_test.cc
index 291cb0128..484252ca4 100644
--- a/libvpx/test/svc_datarate_test.cc
+++ b/libvpx/test/svc_datarate_test.cc
@@ -548,13 +548,16 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
}
if (!single_layer_resize_) {
- ASSERT_EQ(pkt->data.frame.width[sl],
- top_sl_width_ * svc_params_.scaling_factor_num[sl] /
- svc_params_.scaling_factor_den[sl]);
-
- ASSERT_EQ(pkt->data.frame.height[sl],
- top_sl_height_ * svc_params_.scaling_factor_num[sl] /
- svc_params_.scaling_factor_den[sl]);
+ unsigned int scaled_width = top_sl_width_ *
+ svc_params_.scaling_factor_num[sl] /
+ svc_params_.scaling_factor_den[sl];
+ if (scaled_width % 2 != 0) scaled_width += 1;
+ ASSERT_EQ(pkt->data.frame.width[sl], scaled_width);
+ unsigned int scaled_height = top_sl_height_ *
+ svc_params_.scaling_factor_num[sl] /
+ svc_params_.scaling_factor_den[sl];
+ if (scaled_height % 2 != 0) scaled_height += 1;
+ ASSERT_EQ(pkt->data.frame.height[sl], scaled_height);
} else if (superframe_count_ > 0) {
if (pkt->data.frame.width[sl] < prev_frame_width[sl] &&
pkt->data.frame.height[sl] < prev_frame_height[sl])
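
The expected layer dimensions are now rounded up to even before the comparison, matching the rounding the updated assertions imply the encoder applies to odd scaled widths and heights. A sketch with hypothetical layer parameters:

    #include <cstdio>

    int main() {
      const unsigned int top_sl_width = 354;           // hypothetical layer width
      const unsigned int num = 1, den = 2;             // hypothetical scale factors
      unsigned int scaled = top_sl_width * num / den;  // 177, odd
      if (scaled % 2 != 0) scaled += 1;                // 178, rounded up to even
      std::printf("%u\n", scaled);
    }
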
@@ -568,7 +571,7 @@ class DatarateOnePassCbrSvc : public OnePassCbrSvc {
}
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
if (change_bitrate_) last_pts_ = last_pts_ - last_pts_ref_;
duration_ = (last_pts_ + 1) * timebase_;
for (int sl = 0; sl < number_spatial_layers_; ++sl) {
@@ -678,6 +681,152 @@ class DatarateOnePassCbrSvcSingleBR
}
};
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for 4:4:4 Profile 1.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL444Profile1) {
+ SetSvcConfig(3, 3);
+ ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140);
+ cfg_.g_profile = 1;
+ cfg_.g_bit_depth = VPX_BITS_8;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.kf_max_dist = 9999;
+
+ top_sl_width_ = 352;
+ top_sl_height_ = 288;
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.78,
+ 1.15);
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 3
+// temporal layers, for 4:2:2 Profile 1.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc2SL3TL422Profile1) {
+ SetSvcConfig(2, 3);
+ ::libvpx_test::Y4mVideoSource video("park_joy_90p_8_422.y4m", 0, 20);
+ cfg_.g_profile = 1;
+ cfg_.g_bit_depth = VPX_BITS_8;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.kf_max_dist = 9999;
+
+ top_sl_width_ = 160;
+ top_sl_height_ = 90;
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // Use large under/over shoot thresholds as this is a very short clip,
+ // so not good for testing rate-targeting.
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+ 1.7);
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for Profile 2 10bit.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL10bitProfile2) {
+ SetSvcConfig(3, 3);
+ ::libvpx_test::Y4mVideoSource video("park_joy_90p_10_420_20f.y4m", 0, 20);
+ cfg_.g_profile = 2;
+ cfg_.g_bit_depth = VPX_BITS_10;
+ cfg_.g_input_bit_depth = VPX_BITS_10;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.kf_max_dist = 9999;
+
+ top_sl_width_ = 160;
+ top_sl_height_ = 90;
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // TODO(marpan/jianj): Comment out the rate-target checking for now
+ // as superframe parsing to get frame size needs to be fixed for
+ // high bitdepth.
+ /*
+ // Use large under/over shoot thresholds as this is a very short clip,
+ // so not good for testing rate-targeting.
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+ 1.7);
+ */
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
+// Check basic rate targeting for 1 pass CBR SVC: 3 spatial layers and 3
+// temporal layers, for Profile 2 12bit.
+TEST_P(DatarateOnePassCbrSvcSingleBR, OnePassCbrSvc3SL3TL12bitProfile2) {
+ SetSvcConfig(3, 3);
+ ::libvpx_test::Y4mVideoSource video("park_joy_90p_12_420_20f.y4m", 0, 20);
+ cfg_.g_profile = 2;
+ cfg_.g_bit_depth = VPX_BITS_12;
+ cfg_.g_input_bit_depth = VPX_BITS_12;
+ if (cfg_.g_bit_depth > 8) init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.kf_max_dist = 9999;
+
+ top_sl_width_ = 160;
+ top_sl_height_ = 90;
+ cfg_.rc_target_bitrate = 500;
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ // TODO(marpan/jianj): Comment out the rate-target checking for now
+ // as superframe parsing to get frame size needs to be fixed for
+ // high bitdepth.
+ /*
+ // Use large under/over shoot thresholds as this is a very short clip,
+ // so not good for testing rate-targeting.
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.5,
+ 1.7);
+ */
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+#endif
+
// Check basic rate targeting for 1 pass CBR SVC: 2 spatial layers and 1
// temporal layer, with screen content mode on and same speed setting for all
// layers.
@@ -1054,6 +1203,37 @@ TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassCbrSvc2SL3TL) {
#endif
}
+// Check basic rate targeting for 1 pass VBR SVC: 2 spatial layers and
+// 3 temporal layers. Run VGA clip with 1 thread.
+TEST_P(DatarateOnePassCbrSvcMultiBR, OnePassVbrSvc2SL3TL) {
+ SetSvcConfig(2, 3);
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_min_quantizer = 2;
+ cfg_.rc_max_quantizer = 56;
+ cfg_.g_threads = 1;
+ cfg_.rc_dropframe_thresh = 30;
+ cfg_.kf_max_dist = 9999;
+ cfg_.rc_end_usage = VPX_VBR;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, 400);
+ top_sl_width_ = 640;
+ top_sl_height_ = 480;
+ const int bitrates[3] = { 200, 400, 600 };
+ cfg_.rc_target_bitrate = bitrates[GET_PARAM(2)];
+ ResetModel();
+ AssignLayerBitrates();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70,
+ 1.3);
+#if CONFIG_VP9_DECODER
+ // The non-reference frames are expected to be mismatched frames as the
+ // encoder will avoid loopfilter on these frames.
+ EXPECT_EQ(GetNonRefFrames(), GetMismatchFrames());
+#endif
+}
+
// Params: speed setting, layer framedrop control and index for bitrate array.
class DatarateOnePassCbrSvcFrameDropMultiBR
: public DatarateOnePassCbrSvc,
diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk
index 6df457290..f60d8f823 100644
--- a/libvpx/test/test.mk
+++ b/libvpx/test/test.mk
@@ -59,6 +59,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_test.h
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += svc_end_to_end_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += timestamp_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ext_ratectrl_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += ../vp9/simple_encode.h
LIBVPX_TEST_SRCS-yes += decode_test_driver.cc
LIBVPX_TEST_SRCS-yes += decode_test_driver.h
diff --git a/libvpx/test/tools_common.sh b/libvpx/test/tools_common.sh
index 844a12534..0e4a0a5c0 100755
--- a/libvpx/test/tools_common.sh
+++ b/libvpx/test/tools_common.sh
@@ -133,7 +133,7 @@ vpx_config_option_enabled() {
vpx_config_option="${1}"
vpx_config_file="${LIBVPX_CONFIG_PATH}/vpx_config.h"
config_line=$(grep "${vpx_config_option}" "${vpx_config_file}")
- if echo "${config_line}" | egrep -q '1$'; then
+ if echo "${config_line}" | grep -E -q '1$'; then
echo yes
fi
}
@@ -222,7 +222,7 @@ filter_strings() {
if [ -n "${filter}" ]; then
for s in ${strings}; do
- if echo "${s}" | egrep -q ${exclude} "${filter}" > /dev/null 2>&1; then
+ if echo "${s}" | grep -E -q ${exclude} "${filter}" > /dev/null 2>&1; then
filtered_strings="${filtered_strings} ${s}"
fi
done
diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc
index 80855052d..a6c8ef048 100644
--- a/libvpx/test/variance_test.cc
+++ b/libvpx/test/variance_test.cc
@@ -488,8 +488,8 @@ void MainTestClass<VarianceFunctionType>::SpeedTest() {
}
vpx_usec_timer_mark(&timer);
const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
- printf("Variance %dx%d time: %5d ms\n", width(), height(),
- elapsed_time / 1000);
+ printf("Variance %dx%d %dbpp time: %5d ms\n", width(), height(),
+ params_.bit_depth, elapsed_time / 1000);
}
////////////////////////////////////////////////////////////////////////////////
@@ -499,14 +499,21 @@ template <typename FunctionType>
void MainTestClass<FunctionType>::RefTestMse() {
for (int i = 0; i < 10; ++i) {
for (int j = 0; j < block_size(); ++j) {
- src_[j] = rnd_.Rand8();
- ref_[j] = rnd_.Rand8();
+ if (!use_high_bit_depth()) {
+ src_[j] = rnd_.Rand8();
+ ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask();
+ CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask();
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
}
unsigned int sse1, sse2;
const int stride = width();
ASM_REGISTER_STATE_CHECK(params_.func(src_, stride, ref_, stride, &sse1));
variance_ref(src_, ref_, params_.log2width, params_.log2height, stride,
- stride, &sse2, false, VPX_BITS_8);
+ stride, &sse2, use_high_bit_depth(), params_.bit_depth);
EXPECT_EQ(sse1, sse2);
}
}
@@ -530,8 +537,15 @@ void MainTestClass<FunctionType>::RefTestSse() {
template <typename FunctionType>
void MainTestClass<FunctionType>::MaxTestMse() {
- memset(src_, 255, block_size());
- memset(ref_, 0, block_size());
+ if (!use_high_bit_depth()) {
+ memset(src_, 255, block_size());
+ memset(ref_, 0, block_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+ } else {
+ vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << byte_shift(), block_size());
+ vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, block_size());
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
unsigned int sse;
ASM_REGISTER_STATE_CHECK(params_.func(src_, width(), ref_, width(), &sse));
const unsigned int expected = block_size() * 255 * 255;
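// Editorial worked equation (derived from the code above; the rescaling done
// by the highbd kernels is an assumption from the C reference code): for
// 12-bit input, src is 255 << 4 = 4080 and ref is 0, so the raw per-pixel SSE
// is 4080^2 = 255^2 * 2^8. The highbd_12 MSE kernels round the accumulated
// SSE back down by 8 bits, so expected = block_size() * 255 * 255 holds at
// every bit depth.
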
@@ -854,25 +868,25 @@ TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
-/* TODO(debargha): This test does not support the highbd version
typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest;
TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); }
TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); }
+TEST_P(VpxHBDMseTest, DISABLED_Speed) { SpeedTest(); }
INSTANTIATE_TEST_SUITE_P(
C, VpxHBDMseTest,
- ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c),
- MseParams(4, 4, &vpx_highbd_12_mse16x8_c),
- MseParams(4, 4, &vpx_highbd_12_mse8x16_c),
- MseParams(4, 4, &vpx_highbd_12_mse8x8_c),
- MseParams(4, 4, &vpx_highbd_10_mse16x16_c),
- MseParams(4, 4, &vpx_highbd_10_mse16x8_c),
- MseParams(4, 4, &vpx_highbd_10_mse8x16_c),
- MseParams(4, 4, &vpx_highbd_10_mse8x8_c),
- MseParams(4, 4, &vpx_highbd_8_mse16x16_c),
- MseParams(4, 4, &vpx_highbd_8_mse16x8_c),
- MseParams(4, 4, &vpx_highbd_8_mse8x16_c),
- MseParams(4, 4, &vpx_highbd_8_mse8x8_c)));
-*/
+ ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_c, VPX_BITS_12),
+ MseParams(4, 3, &vpx_highbd_12_mse16x8_c, VPX_BITS_12),
+ MseParams(3, 4, &vpx_highbd_12_mse8x16_c, VPX_BITS_12),
+ MseParams(3, 3, &vpx_highbd_12_mse8x8_c, VPX_BITS_12),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_c, VPX_BITS_10),
+ MseParams(4, 3, &vpx_highbd_10_mse16x8_c, VPX_BITS_10),
+ MseParams(3, 4, &vpx_highbd_10_mse8x16_c, VPX_BITS_10),
+ MseParams(3, 3, &vpx_highbd_10_mse8x8_c, VPX_BITS_10),
+ MseParams(4, 4, &vpx_highbd_8_mse16x16_c, VPX_BITS_8),
+ MseParams(4, 3, &vpx_highbd_8_mse16x8_c, VPX_BITS_8),
+ MseParams(3, 4, &vpx_highbd_8_mse8x16_c, VPX_BITS_8),
+ MseParams(3, 3, &vpx_highbd_8_mse8x8_c, VPX_BITS_8)));
+
GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(VpxHBDMseTest);
INSTANTIATE_TEST_SUITE_P(
@@ -1138,22 +1152,15 @@ INSTANTIATE_TEST_SUITE_P(
SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
#if CONFIG_VP9_HIGHBITDEPTH
-/* TODO(debargha): This test does not support the highbd version
INSTANTIATE_TEST_SUITE_P(
SSE2, VpxHBDMseTest,
- ::testing::Values(MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2),
- MseParams(4, 3, &vpx_highbd_12_mse16x8_sse2),
- MseParams(3, 4, &vpx_highbd_12_mse8x16_sse2),
- MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2),
- MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2),
- MseParams(4, 3, &vpx_highbd_10_mse16x8_sse2),
- MseParams(3, 4, &vpx_highbd_10_mse8x16_sse2),
- MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2),
- MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2),
- MseParams(4, 3, &vpx_highbd_8_mse16x8_sse2),
- MseParams(3, 4, &vpx_highbd_8_mse8x16_sse2),
- MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2)));
-*/
+ ::testing::Values(
+ MseParams(4, 4, &vpx_highbd_12_mse16x16_sse2, VPX_BITS_12),
+ MseParams(3, 3, &vpx_highbd_12_mse8x8_sse2, VPX_BITS_12),
+ MseParams(4, 4, &vpx_highbd_10_mse16x16_sse2, VPX_BITS_10),
+ MseParams(3, 3, &vpx_highbd_10_mse8x8_sse2, VPX_BITS_10),
+ MseParams(4, 4, &vpx_highbd_8_mse16x16_sse2, VPX_BITS_8),
+ MseParams(3, 3, &vpx_highbd_8_mse8x8_sse2, VPX_BITS_8)));
INSTANTIATE_TEST_SUITE_P(
SSE2, VpxHBDVarianceTest,
@@ -1495,6 +1502,224 @@ INSTANTIATE_TEST_SUITE_P(
SubpelAvgVarianceParams(3, 2, &vpx_sub_pixel_avg_variance8x4_neon, 0),
SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_neon, 0),
SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_neon, 0)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDVarianceTest,
+ ::testing::Values(
+ VarianceParams(6, 6, &vpx_highbd_12_variance64x64_neon, 12),
+ VarianceParams(6, 5, &vpx_highbd_12_variance64x32_neon, 12),
+ VarianceParams(5, 6, &vpx_highbd_12_variance32x64_neon, 12),
+ VarianceParams(5, 5, &vpx_highbd_12_variance32x32_neon, 12),
+ VarianceParams(5, 4, &vpx_highbd_12_variance32x16_neon, 12),
+ VarianceParams(4, 5, &vpx_highbd_12_variance16x32_neon, 12),
+ VarianceParams(4, 4, &vpx_highbd_12_variance16x16_neon, 12),
+ VarianceParams(4, 3, &vpx_highbd_12_variance16x8_neon, 12),
+ VarianceParams(3, 4, &vpx_highbd_12_variance8x16_neon, 12),
+ VarianceParams(3, 3, &vpx_highbd_12_variance8x8_neon, 12),
+ VarianceParams(3, 2, &vpx_highbd_12_variance8x4_neon, 12),
+ VarianceParams(2, 3, &vpx_highbd_12_variance4x8_neon, 12),
+ VarianceParams(2, 2, &vpx_highbd_12_variance4x4_neon, 12),
+ VarianceParams(6, 6, &vpx_highbd_10_variance64x64_neon, 10),
+ VarianceParams(6, 5, &vpx_highbd_10_variance64x32_neon, 10),
+ VarianceParams(5, 6, &vpx_highbd_10_variance32x64_neon, 10),
+ VarianceParams(5, 5, &vpx_highbd_10_variance32x32_neon, 10),
+ VarianceParams(5, 4, &vpx_highbd_10_variance32x16_neon, 10),
+ VarianceParams(4, 5, &vpx_highbd_10_variance16x32_neon, 10),
+ VarianceParams(4, 4, &vpx_highbd_10_variance16x16_neon, 10),
+ VarianceParams(4, 3, &vpx_highbd_10_variance16x8_neon, 10),
+ VarianceParams(3, 4, &vpx_highbd_10_variance8x16_neon, 10),
+ VarianceParams(3, 3, &vpx_highbd_10_variance8x8_neon, 10),
+ VarianceParams(3, 2, &vpx_highbd_10_variance8x4_neon, 10),
+ VarianceParams(2, 3, &vpx_highbd_10_variance4x8_neon, 10),
+ VarianceParams(2, 2, &vpx_highbd_10_variance4x4_neon, 10),
+ VarianceParams(6, 6, &vpx_highbd_8_variance64x64_neon, 8),
+ VarianceParams(6, 5, &vpx_highbd_8_variance64x32_neon, 8),
+ VarianceParams(5, 6, &vpx_highbd_8_variance32x64_neon, 8),
+ VarianceParams(5, 5, &vpx_highbd_8_variance32x32_neon, 8),
+ VarianceParams(5, 4, &vpx_highbd_8_variance32x16_neon, 8),
+ VarianceParams(4, 5, &vpx_highbd_8_variance16x32_neon, 8),
+ VarianceParams(4, 4, &vpx_highbd_8_variance16x16_neon, 8),
+ VarianceParams(4, 3, &vpx_highbd_8_variance16x8_neon, 8),
+ VarianceParams(3, 4, &vpx_highbd_8_variance8x16_neon, 8),
+ VarianceParams(3, 3, &vpx_highbd_8_variance8x8_neon, 8),
+ VarianceParams(3, 2, &vpx_highbd_8_variance8x4_neon, 8),
+ VarianceParams(2, 3, &vpx_highbd_8_variance4x8_neon, 8),
+ VarianceParams(2, 2, &vpx_highbd_8_variance4x4_neon, 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDSubpelVarianceTest,
+ ::testing::Values(
+ SubpelVarianceParams(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_neon,
+ 12),
+ SubpelVarianceParams(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_neon,
+ 12),
+ SubpelVarianceParams(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_neon,
+ 12),
+ SubpelVarianceParams(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_neon,
+ 12),
+ SubpelVarianceParams(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_neon,
+ 12),
+ SubpelVarianceParams(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_neon,
+ 12),
+ SubpelVarianceParams(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_neon,
+ 12),
+ SubpelVarianceParams(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_neon,
+ 12),
+ SubpelVarianceParams(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_neon,
+ 12),
+ SubpelVarianceParams(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_neon,
+ 12),
+ SubpelVarianceParams(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_neon,
+ 12),
+ SubpelVarianceParams(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_neon,
+ 10),
+ SubpelVarianceParams(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_neon,
+ 10),
+ SubpelVarianceParams(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_neon,
+ 10),
+ SubpelVarianceParams(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_neon,
+ 10),
+ SubpelVarianceParams(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_neon,
+ 10),
+ SubpelVarianceParams(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_neon,
+ 10),
+ SubpelVarianceParams(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_neon,
+ 10),
+ SubpelVarianceParams(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_neon,
+ 10),
+ SubpelVarianceParams(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_neon,
+ 10),
+ SubpelVarianceParams(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_neon,
+ 10),
+ SubpelVarianceParams(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_neon,
+ 10),
+ SubpelVarianceParams(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_neon,
+ 8),
+ SubpelVarianceParams(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_neon,
+ 8),
+ SubpelVarianceParams(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_neon,
+ 8),
+ SubpelVarianceParams(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_neon,
+ 8),
+ SubpelVarianceParams(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_neon,
+ 8),
+ SubpelVarianceParams(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_neon,
+ 8),
+ SubpelVarianceParams(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_neon,
+ 8),
+ SubpelVarianceParams(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_neon,
+ 8),
+ SubpelVarianceParams(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_neon,
+ 8),
+ SubpelVarianceParams(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_neon, 8),
+ SubpelVarianceParams(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_neon,
+ 8)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VpxHBDSubpelAvgVarianceTest,
+ ::testing::Values(
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance64x64_neon,
+ 12),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance64x32_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_12_sub_pixel_avg_variance32x64_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance32x32_neon,
+ 12),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance32x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_12_sub_pixel_avg_variance16x32_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance16x16_neon,
+ 12),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance16x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_12_sub_pixel_avg_variance8x16_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_12_sub_pixel_avg_variance8x8_neon,
+ 12),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_12_sub_pixel_avg_variance8x4_neon,
+ 12),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance64x64_neon,
+ 10),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance64x32_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_10_sub_pixel_avg_variance32x64_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance32x32_neon,
+ 10),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance32x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_10_sub_pixel_avg_variance16x32_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance16x16_neon,
+ 10),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance16x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_10_sub_pixel_avg_variance8x16_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_10_sub_pixel_avg_variance8x8_neon,
+ 10),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_10_sub_pixel_avg_variance8x4_neon,
+ 10),
+ SubpelAvgVarianceParams(6, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance64x64_neon,
+ 8),
+ SubpelAvgVarianceParams(6, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance64x32_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 6,
+ &vpx_highbd_8_sub_pixel_avg_variance32x64_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance32x32_neon,
+ 8),
+ SubpelAvgVarianceParams(5, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance32x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 5,
+ &vpx_highbd_8_sub_pixel_avg_variance16x32_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance16x16_neon,
+ 8),
+ SubpelAvgVarianceParams(4, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance16x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 4,
+ &vpx_highbd_8_sub_pixel_avg_variance8x16_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 3,
+ &vpx_highbd_8_sub_pixel_avg_variance8x8_neon,
+ 8),
+ SubpelAvgVarianceParams(3, 2,
+ &vpx_highbd_8_sub_pixel_avg_variance8x4_neon,
+ 8)));
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
#if HAVE_MSA
diff --git a/libvpx/test/vp8_datarate_test.cc b/libvpx/test/vp8_datarate_test.cc
index dcd68a2d4..64a861d15 100644
--- a/libvpx/test/vp8_datarate_test.cc
+++ b/libvpx/test/vp8_datarate_test.cc
@@ -121,7 +121,7 @@ class DatarateTestLarge
++frame_number_;
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
if (bits_total_) {
const double file_size_in_kb = bits_total_ / 1000.; // bits per kilobit
diff --git a/libvpx/test/vp8_ratectrl_rtc_test.cc b/libvpx/test/vp8_ratectrl_rtc_test.cc
index ad310666e..7410f3c01 100644
--- a/libvpx/test/vp8_ratectrl_rtc_test.cc
+++ b/libvpx/test/vp8_ratectrl_rtc_test.cc
@@ -127,8 +127,7 @@ class Vp8RcInterfaceTest
encoder->Control(VP8E_SET_CPUUSED, -6);
encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
- }
- if (frame_params_.frame_type == INTER_FRAME) {
+ } else if (frame_params_.frame_type == INTER_FRAME) {
// Disable golden frame update.
frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
diff --git a/libvpx/test/vp9_datarate_test.cc b/libvpx/test/vp9_datarate_test.cc
index 9930c754c..7e9180749 100644
--- a/libvpx/test/vp9_datarate_test.cc
+++ b/libvpx/test/vp9_datarate_test.cc
@@ -9,6 +9,7 @@
*/
#include "./vpx_config.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
#include "test/codec_factory.h"
#include "test/encode_test_driver.h"
#include "test/i420_video_source.h"
@@ -147,14 +148,16 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
if (video->frame() == 0) {
encoder->Control(VP9E_SET_SVC, 1);
}
- vpx_svc_layer_id_t layer_id;
- layer_id.spatial_layer_id = 0;
- frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
- layer_id.temporal_layer_id =
- SetLayerId(video->frame(), cfg_.ts_number_layers);
- layer_id.temporal_layer_id_per_spatial[0] =
- SetLayerId(video->frame(), cfg_.ts_number_layers);
- encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+ if (cfg_.temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+ vpx_svc_layer_id_t layer_id;
+ frame_flags_ = GetFrameFlags(video->frame(), cfg_.ts_number_layers);
+ layer_id.spatial_layer_id = 0;
+ layer_id.temporal_layer_id =
+ SetLayerId(video->frame(), cfg_.ts_number_layers);
+ layer_id.temporal_layer_id_per_spatial[0] =
+ SetLayerId(video->frame(), cfg_.ts_number_layers);
+ encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+ }
}
const vpx_rational_t tb = video->timebase();
timebase_ = static_cast<double>(tb.num) / tb.den;
@@ -199,7 +202,7 @@ class DatarateTestVP9 : public ::libvpx_test::EncoderTest {
++tot_frame_number_;
}
- virtual void EndPassHook(void) {
+ virtual void EndPassHook() {
for (int layer = 0; layer < static_cast<int>(cfg_.ts_number_layers);
++layer) {
duration_ = (last_pts_ + 1) * timebase_;
@@ -809,6 +812,135 @@ TEST_P(DatarateTestVP9PostEncodeDrop, PostEncodeDropScreenContent) {
<< " The datarate for the file is greater than target by too much!";
}
+using libvpx_test::ACMRandom;
+
+class DatarateTestVP9FrameQp
+ : public DatarateTestVP9,
+ public ::testing::TestWithParam<const libvpx_test::CodecFactory *> {
+ public:
+ DatarateTestVP9FrameQp() : DatarateTestVP9(GetParam()), frame_(0) {}
+ virtual ~DatarateTestVP9FrameQp() {}
+
+ protected:
+ virtual void SetUp() {
+ InitializeConfig();
+ SetMode(::libvpx_test::kRealTime);
+ ResetModel();
+ }
+
+ virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) {
+ set_cpu_used_ = 7;
+ DatarateTestVP9::PreEncodeFrameHook(video, encoder);
+ frame_qp_ = static_cast<int>(rnd_.RandRange(64));
+ encoder->Control(VP9E_SET_QUANTIZER_ONE_PASS, frame_qp_);
+ frame_++;
+ }
+
+ virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+ int qp = 0;
+ vpx_svc_layer_id_t layer_id;
+ if (frame_ >= total_frame_) return;
+ encoder->Control(VP8E_GET_LAST_QUANTIZER_64, &qp);
+ ASSERT_EQ(frame_qp_, qp);
+ encoder->Control(VP9E_GET_SVC_LAYER_ID, &layer_id);
+ temporal_layer_id_ = layer_id.temporal_layer_id;
+ }
+
+ virtual void MismatchHook(const vpx_image_t * /*img1*/,
+ const vpx_image_t * /*img2*/) {
+ if (frame_ >= total_frame_) return;
+ ASSERT_TRUE(cfg_.temporal_layering_mode ==
+ VP9E_TEMPORAL_LAYERING_MODE_0212 &&
+ temporal_layer_id_ == 2);
+ }
+
+ protected:
+ int total_frame_;
+
+ private:
+ ACMRandom rnd_;
+ int frame_qp_;
+ int frame_;
+ int temporal_layer_id_;
+};
+
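// Editorial note (inferred from the hooks above and the loopfilter comments
// elsewhere in this patch): in VP9E_TEMPORAL_LAYERING_MODE_0212 the frames
// with temporal_layer_id == 2 are the non-reference frames, and the encoder
// skips the loopfilter on non-reference frames, so MismatchHook tolerates
// encode/decode mismatches only for those frames.
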
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ total_frame_ = 400;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, total_frame_);
+ ResetModel();
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersBypass) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+ cfg_.ss_number_layers = 1;
+ cfg_.ts_number_layers = 3;
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+
+ cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+ cfg_.rc_target_bitrate = 200;
+ total_frame_ = 400;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, total_frame_);
+ ResetModel();
+ cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+ cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+ cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
+TEST_P(DatarateTestVP9FrameQp, VP9SetFrameQp3TemporalLayersFixedMode) {
+ cfg_.rc_buf_initial_sz = 500;
+ cfg_.rc_buf_optimal_sz = 500;
+ cfg_.rc_buf_sz = 1000;
+ cfg_.rc_dropframe_thresh = 0;
+ cfg_.rc_max_quantizer = 63;
+ cfg_.rc_min_quantizer = 0;
+ cfg_.rc_end_usage = VPX_CBR;
+ cfg_.g_lag_in_frames = 0;
+
+ // 3 Temporal layers, no spatial layers: Framerate decimation (4, 2, 1).
+ cfg_.ss_number_layers = 1;
+ cfg_.ts_number_layers = 3;
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+
+ cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_0212;
+ cfg_.rc_target_bitrate = 200;
+ cfg_.g_error_resilient = 1;
+ total_frame_ = 400;
+ ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
+ 0, total_frame_);
+ ResetModel();
+ cfg_.layer_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+ cfg_.layer_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+ cfg_.layer_target_bitrate[2] = cfg_.rc_target_bitrate;
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
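// Editorial sketch of the per-frame QP control exercised by the tests above,
// outside the test harness (the two control IDs appear in the hooks above;
// the surrounding call pattern is an illustrative assumption):
//
//   int used_qp = 0;
//   vpx_codec_control(&codec, VP9E_SET_QUANTIZER_ONE_PASS, frame_qp);
//   // ... encode one frame ...
//   vpx_codec_control(&codec, VP8E_GET_LAST_QUANTIZER_64, &used_qp);
//   assert(used_qp == frame_qp);  // frame_qp is in [0, 63]
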
#if CONFIG_VP9_TEMPORAL_DENOISING
// Params: speed setting.
class DatarateTestVP9RealTimeDenoiser : public DatarateTestVP9RealTime {
@@ -943,6 +1075,13 @@ VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9LargeVBR, ::testing::Range(5, 9),
VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTime, ::testing::Range(5, 10));
+#if CONFIG_VP9
+INSTANTIATE_TEST_SUITE_P(
+ VP9, DatarateTestVP9FrameQp,
+ ::testing::Values(
+ static_cast<const libvpx_test::CodecFactory *>(&libvpx_test::kVP9)));
+#endif
+
VP9_INSTANTIATE_TEST_SUITE(DatarateTestVP9RealTimeDeltaQUV,
::testing::Range(5, 10),
::testing::Values(-5, -10, -15));
diff --git a/libvpx/test/vp9_ext_ratectrl_test.cc b/libvpx/test/vp9_ext_ratectrl_test.cc
index 60a350b84..2bfa6281d 100644
--- a/libvpx/test/vp9_ext_ratectrl_test.cc
+++ b/libvpx/test/vp9_ext_ratectrl_test.cc
@@ -16,28 +16,50 @@
#include "test/util.h"
#include "test/yuv_video_source.h"
#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vp9/simple_encode.h"
#include "vpx/vpx_ext_ratectrl.h"
+#include "vpx_dsp/vpx_dsp_common.h"
namespace {
constexpr int kModelMagicNumber = 51396;
constexpr uintptr_t PrivMagicNumber = 5566;
constexpr int kFrameNum = 5;
+constexpr int kFrameNumGOP = 30;
+constexpr int kFrameNumGOPShort = 4;
constexpr int kLosslessCodingIndex = 2;
+constexpr int kFixedGOPSize = 9;
+// The range check in vp9_cx_iface.c shows that the maximum
+// allowed lag in frames is MAX_LAG_BUFFERS (25):
+// RANGE_CHECK_HI(cfg, g_lag_in_frames, MAX_LAG_BUFFERS);
+constexpr int kMaxLagInFrames = 25;
+constexpr int kDefaultMinGfInterval = 4;
+constexpr int kDefaultMaxGfInterval = 16;
+// The active gf interval might change for each GOP
+// See function "get_active_gf_inverval_range".
+// The numbers below are from manual inspection.
+constexpr int kReadMinGfInterval = 5;
+constexpr int kReadMaxGfInterval = 13;
+const char kTestFileName[] = "bus_352x288_420_f20_b8.yuv";
+const double kPsnrThreshold = 30.50;
struct ToyRateCtrl {
int magic_number;
int coding_index;
+
+ int gop_global_index;
+ int frames_since_key;
+ int show_index;
};
vpx_rc_status_t rc_create_model(void *priv,
const vpx_rc_config_t *ratectrl_config,
- vpx_rc_model_t *rate_ctrl_model_pt) {
+ vpx_rc_model_t *rate_ctrl_model_ptr) {
ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
- EXPECT_NE(toy_rate_ctrl, nullptr);
+ if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
toy_rate_ctrl->magic_number = kModelMagicNumber;
toy_rate_ctrl->coding_index = -1;
- *rate_ctrl_model_pt = toy_rate_ctrl;
+ *rate_ctrl_model_ptr = toy_rate_ctrl;
EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
EXPECT_EQ(ratectrl_config->frame_width, 352);
EXPECT_EQ(ratectrl_config->frame_height, 288);
@@ -48,6 +70,48 @@ vpx_rc_status_t rc_create_model(void *priv,
return VPX_RC_OK;
}
+vpx_rc_status_t rc_create_model_gop(void *priv,
+ const vpx_rc_config_t *ratectrl_config,
+ vpx_rc_model_t *rate_ctrl_model_ptr) {
+ ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
+ if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
+ toy_rate_ctrl->magic_number = kModelMagicNumber;
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ toy_rate_ctrl->show_index = 0;
+ toy_rate_ctrl->coding_index = 0;
+ *rate_ctrl_model_ptr = toy_rate_ctrl;
+ EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
+ EXPECT_EQ(ratectrl_config->frame_width, 640);
+ EXPECT_EQ(ratectrl_config->frame_height, 360);
+ EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOP);
+ EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 4000);
+ EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
+ EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_create_model_gop_short(
+ void *priv, const vpx_rc_config_t *ratectrl_config,
+ vpx_rc_model_t *rate_ctrl_model_ptr) {
+ ToyRateCtrl *toy_rate_ctrl = new (std::nothrow) ToyRateCtrl;
+ if (toy_rate_ctrl == nullptr) return VPX_RC_ERROR;
+ toy_rate_ctrl->magic_number = kModelMagicNumber;
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ toy_rate_ctrl->show_index = 0;
+ toy_rate_ctrl->coding_index = 0;
+ *rate_ctrl_model_ptr = toy_rate_ctrl;
+ EXPECT_EQ(priv, reinterpret_cast<void *>(PrivMagicNumber));
+ EXPECT_EQ(ratectrl_config->frame_width, 352);
+ EXPECT_EQ(ratectrl_config->frame_height, 288);
+ EXPECT_EQ(ratectrl_config->show_frame_count, kFrameNumGOPShort);
+ EXPECT_EQ(ratectrl_config->target_bitrate_kbps, 500);
+ EXPECT_EQ(ratectrl_config->frame_rate_num, 30);
+ EXPECT_EQ(ratectrl_config->frame_rate_den, 1);
+ return VPX_RC_OK;
+}
+
vpx_rc_status_t rc_send_firstpass_stats(
vpx_rc_model_t rate_ctrl_model,
const vpx_rc_firstpass_stats_t *first_pass_stats) {
@@ -61,6 +125,32 @@ vpx_rc_status_t rc_send_firstpass_stats(
return VPX_RC_OK;
}
+vpx_rc_status_t rc_send_firstpass_stats_gop(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_firstpass_stats_t *first_pass_stats) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOP);
+ for (int i = 0; i < first_pass_stats->num_frames; ++i) {
+ EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
+ }
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_send_firstpass_stats_gop_short(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_firstpass_stats_t *first_pass_stats) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(first_pass_stats->num_frames, kFrameNumGOPShort);
+ for (int i = 0; i < first_pass_stats->num_frames; ++i) {
+ EXPECT_DOUBLE_EQ(first_pass_stats->frame_stats[i].frame, i);
+ }
+ return VPX_RC_OK;
+}
+
vpx_rc_status_t rc_get_encodeframe_decision(
vpx_rc_model_t rate_ctrl_model,
const vpx_rc_encodeframe_info_t *encode_frame_info,
@@ -76,19 +166,17 @@ vpx_rc_status_t rc_get_encodeframe_decision(
if (encode_frame_info->coding_index == 0) {
EXPECT_EQ(encode_frame_info->show_index, 0);
EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, 0 /*kFrameTypeKey*/);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
0); // kRefFrameTypeLast
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
0); // kRefFrameTypePast
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
0); // kRefFrameTypeFuture
- }
-
- if (encode_frame_info->coding_index == 1) {
+ } else if (encode_frame_info->coding_index == 1) {
EXPECT_EQ(encode_frame_info->show_index, 4);
EXPECT_EQ(encode_frame_info->gop_index, 1);
- EXPECT_EQ(encode_frame_info->frame_type, 2 /*kFrameTypeAltRef*/);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
1); // kRefFrameTypeLast
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
@@ -97,19 +185,15 @@ vpx_rc_status_t rc_get_encodeframe_decision(
0); // kRefFrameTypeFuture
EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
0); // kRefFrameTypeLast
- }
-
- if (encode_frame_info->coding_index >= 2 &&
- encode_frame_info->coding_index < 5) {
+ } else if (encode_frame_info->coding_index >= 2 &&
+ encode_frame_info->coding_index < 5) {
// In the first group of pictures, coding_index and gop_index are equal.
EXPECT_EQ(encode_frame_info->gop_index, encode_frame_info->coding_index);
- EXPECT_EQ(encode_frame_info->frame_type, 1 /*kFrameTypeInter*/);
- }
-
- if (encode_frame_info->coding_index == 5) {
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ } else if (encode_frame_info->coding_index == 5) {
EXPECT_EQ(encode_frame_info->show_index, 4);
EXPECT_EQ(encode_frame_info->gop_index, 0);
- EXPECT_EQ(encode_frame_info->frame_type, 3 /*kFrameTypeOverlay*/);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
1); // kRefFrameTypeLast
EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
@@ -133,6 +217,388 @@ vpx_rc_status_t rc_get_encodeframe_decision(
return VPX_RC_OK;
}
+vpx_rc_status_t rc_get_encodeframe_decision_gop(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOP);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ if (encode_frame_info->coding_index == 0) {
+ EXPECT_EQ(encode_frame_info->show_index, 0);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ } else if (encode_frame_info->coding_index == 1) {
+ EXPECT_EQ(encode_frame_info->show_index, 1);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 1); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+ 0); // kRefFrameTypeLast
+ } else if (encode_frame_info->coding_index == 2) {
+ EXPECT_EQ(encode_frame_info->show_index, 2);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ } else if (encode_frame_info->coding_index == 3 ||
+ encode_frame_info->coding_index == 12 ||
+ encode_frame_info->coding_index == 21) {
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ } else if (encode_frame_info->coding_index == 11 ||
+ encode_frame_info->coding_index == 20 ||
+ encode_frame_info->coding_index == 29) {
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ } else if (encode_frame_info->coding_index >= 30) {
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ }
+
+ // If the model recommends an invalid q (outside the valid range [0, 255]),
+ // the encoder will ignore it and use the default q selected by the
+ // libvpx rate control strategy.
+ frame_decision->q_index = VPX_DEFAULT_Q;
+ frame_decision->max_frame_size = 0;
+
+ toy_rate_ctrl->coding_index += 1;
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ if (encode_frame_info->coding_index == 0) {
+ EXPECT_EQ(encode_frame_info->show_index, 0);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 1) {
+ EXPECT_EQ(encode_frame_info->show_index, 1);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 1); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 2) {
+ EXPECT_EQ(encode_frame_info->show_index, 2);
+ EXPECT_EQ(encode_frame_info->gop_index, 2);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 3) {
+ EXPECT_EQ(encode_frame_info->show_index, 3);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeGolden);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
+ }
+
+ // If the model recommends an invalid q (outside the valid range [0, 255]),
+ // the encoder will ignore it and use the default q selected by the
+ // libvpx rate control strategy.
+ frame_decision->q_index = VPX_DEFAULT_Q;
+ frame_decision->max_frame_size = 0;
+
+ toy_rate_ctrl->coding_index += 1;
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short_overlay(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ if (encode_frame_info->coding_index == 0) {
+ EXPECT_EQ(encode_frame_info->show_index, 0);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 1) {
+ EXPECT_EQ(encode_frame_info->show_index, 3);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeAltRef);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 1); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 2) {
+ EXPECT_EQ(encode_frame_info->show_index, 1);
+ EXPECT_EQ(encode_frame_info->gop_index, 2);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 3) {
+ EXPECT_EQ(encode_frame_info->show_index, 2);
+ EXPECT_EQ(encode_frame_info->gop_index, 3);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 4) {
+ EXPECT_EQ(encode_frame_info->show_index, 3);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeOverlay);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 2);
+ }
+
+ // If the model recommends an invalid q (outside the valid range [0, 255]),
+ // the encoder will ignore it and use the default q selected by the
+ // libvpx rate control strategy.
+ frame_decision->q_index = VPX_DEFAULT_Q;
+ frame_decision->max_frame_size = 0;
+
+ toy_rate_ctrl->coding_index += 1;
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_encodeframe_decision_gop_short_no_arf(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info,
+ vpx_rc_encodeframe_decision_t *frame_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ if (encode_frame_info->coding_index == 0) {
+ EXPECT_EQ(encode_frame_info->show_index, 0);
+ EXPECT_EQ(encode_frame_info->gop_index, 0);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeKey);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 1) {
+ EXPECT_EQ(encode_frame_info->show_index, 1);
+ EXPECT_EQ(encode_frame_info->gop_index, 1);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[0],
+ 1); // kRefFrameTypeLast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[1],
+ 0); // kRefFrameTypePast
+ EXPECT_EQ(encode_frame_info->ref_frame_valid_list[2],
+ 0); // kRefFrameTypeFuture
+ EXPECT_EQ(encode_frame_info->ref_frame_coding_indexes[0],
+ 0); // kRefFrameTypeLast
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 2) {
+ EXPECT_EQ(encode_frame_info->show_index, 2);
+ EXPECT_EQ(encode_frame_info->gop_index, 2);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ } else if (encode_frame_info->coding_index == 3) {
+ EXPECT_EQ(encode_frame_info->show_index, 3);
+ EXPECT_EQ(encode_frame_info->gop_index, 3);
+ EXPECT_EQ(encode_frame_info->frame_type, vp9::kFrameTypeInter);
+ EXPECT_EQ(toy_rate_ctrl->gop_global_index, 1);
+ }
+
+ // If the model recommends an invalid q (outside the valid range [0, 255]),
+ // the encoder will ignore it and use the default q selected by the
+ // libvpx rate control strategy.
+ frame_decision->q_index = VPX_DEFAULT_Q;
+ frame_decision->max_frame_size = 0;
+
+ toy_rate_ctrl->coding_index += 1;
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_gop_decision(vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames);
+ EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+ EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+ EXPECT_EQ(gop_info->active_min_gf_interval, kReadMinGfInterval);
+ EXPECT_EQ(gop_info->active_max_gf_interval, kReadMaxGfInterval);
+ EXPECT_EQ(gop_info->allow_alt_ref, 1);
+ if (gop_info->is_key_frame) {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ EXPECT_EQ(gop_info->frames_since_key, 0);
+ EXPECT_EQ(gop_info->gop_global_index, 0);
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ } else {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
+ }
+ EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+ EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+ EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+ EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+ gop_decision->gop_coding_frames =
+ VPXMIN(kFixedGOPSize, gop_info->frames_to_key);
+ gop_decision->use_alt_ref = gop_decision->gop_coding_frames == kFixedGOPSize;
+ toy_rate_ctrl->frames_since_key +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ toy_rate_ctrl->show_index +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ ++toy_rate_ctrl->gop_global_index;
+ return VPX_RC_OK;
+}
+
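// Editorial worked example of the decision above, with kFixedGOPSize = 9:
// while frames_to_key >= 9, each GOP codes 9 frames with an alt ref and
// advances show_index and frames_since_key by 9 - 1 = 8 (the alt ref is not
// displayed); once frames_to_key drops below 9, the final GOP is shortened
// and use_alt_ref becomes 0.
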
+// Test on a 4-frame video with a setting of 2 GOPs:
+// the first GOP has 3 coding frames and no alt ref;
+// the second GOP has 1 coding frame and no alt ref.
+vpx_rc_status_t rc_get_gop_decision_short(vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+ EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+ EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+ EXPECT_EQ(gop_info->allow_alt_ref, 1);
+ if (gop_info->is_key_frame) {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ EXPECT_EQ(gop_info->frames_since_key, 0);
+ EXPECT_EQ(gop_info->gop_global_index, 0);
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ } else {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ }
+ EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+ EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+ EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+ EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+ gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 3 : 1;
+ gop_decision->use_alt_ref = 0;
+ toy_rate_ctrl->frames_since_key +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ toy_rate_ctrl->show_index +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ ++toy_rate_ctrl->gop_global_index;
+ return VPX_RC_OK;
+}
+
+// Test on a 4-frame video with a setting of 2 GOPs:
+// the first GOP has 4 coding frames and uses an alt ref;
+// the second GOP contains only the overlay frame of the first GOP's
+// alt ref frame.
+vpx_rc_status_t rc_get_gop_decision_short_overlay(
+ vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+ EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+ EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+ EXPECT_EQ(gop_info->allow_alt_ref, 1);
+ if (gop_info->is_key_frame) {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ EXPECT_EQ(gop_info->frames_since_key, 0);
+ EXPECT_EQ(gop_info->gop_global_index, 0);
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ } else {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 1);
+ }
+ EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+ EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+ EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+ EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+ gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
+ gop_decision->use_alt_ref = gop_info->is_key_frame ? 1 : 0;
+ toy_rate_ctrl->frames_since_key +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ toy_rate_ctrl->show_index +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ ++toy_rate_ctrl->gop_global_index;
+ return VPX_RC_OK;
+}
+
+// Test on a 4-frame video with a setting of 1 GOP:
+// the GOP has 4 coding frames and does not use an alt ref.
+vpx_rc_status_t rc_get_gop_decision_short_no_arf(
+ vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_EQ(gop_info->lag_in_frames, kMaxLagInFrames - 1);
+ EXPECT_EQ(gop_info->min_gf_interval, kDefaultMinGfInterval);
+ EXPECT_EQ(gop_info->max_gf_interval, kDefaultMaxGfInterval);
+ EXPECT_EQ(gop_info->allow_alt_ref, 1);
+ if (gop_info->is_key_frame) {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ EXPECT_EQ(gop_info->frames_since_key, 0);
+ EXPECT_EQ(gop_info->gop_global_index, 0);
+ toy_rate_ctrl->gop_global_index = 0;
+ toy_rate_ctrl->frames_since_key = 0;
+ } else {
+ EXPECT_EQ(gop_info->last_gop_use_alt_ref, 0);
+ }
+ EXPECT_EQ(gop_info->gop_global_index, toy_rate_ctrl->gop_global_index);
+ EXPECT_EQ(gop_info->frames_since_key, toy_rate_ctrl->frames_since_key);
+ EXPECT_EQ(gop_info->show_index, toy_rate_ctrl->show_index);
+ EXPECT_EQ(gop_info->coding_index, toy_rate_ctrl->coding_index);
+
+ gop_decision->gop_coding_frames = gop_info->gop_global_index == 0 ? 4 : 1;
+ gop_decision->use_alt_ref = 0;
+ toy_rate_ctrl->frames_since_key +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ toy_rate_ctrl->show_index +=
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref;
+ ++toy_rate_ctrl->gop_global_index;
+ return VPX_RC_OK;
+}
+
vpx_rc_status_t rc_update_encodeframe_result(
vpx_rc_model_t rate_ctrl_model,
const vpx_rc_encodeframe_result_t *encode_frame_result) {
@@ -153,6 +619,43 @@ vpx_rc_status_t rc_update_encodeframe_result(
return VPX_RC_OK;
}
+vpx_rc_status_t rc_update_encodeframe_result_gop(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_result_t *encode_frame_result) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+
+ const int64_t ref_pixel_count = 640 * 360 * 3 / 2;
+ EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_update_encodeframe_result_gop_short(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_result_t *encode_frame_result) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+
+ const int64_t ref_pixel_count = 352 * 288 * 3 / 2;
+ EXPECT_EQ(encode_frame_result->pixel_count, ref_pixel_count);
+ return VPX_RC_OK;
+}
+
+vpx_rc_status_t rc_get_default_frame_rdmult(
+ vpx_rc_model_t rate_ctrl_model,
+ const vpx_rc_encodeframe_info_t *encode_frame_info, int *rdmult) {
+ const ToyRateCtrl *toy_rate_ctrl =
+ static_cast<ToyRateCtrl *>(rate_ctrl_model);
+ EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
+ EXPECT_LT(encode_frame_info->show_index, kFrameNumGOPShort);
+ EXPECT_EQ(encode_frame_info->coding_index, toy_rate_ctrl->coding_index);
+
+ *rdmult = VPX_DEFAULT_RDMULT;
+ return VPX_RC_OK;
+}
+
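// Editorial sketch (an assumption, not part of the patch): a non-default
// implementation of get_frame_rdmult would bias the per-frame rate-distortion
// tradeoff, e.g. by scaling some externally computed base value:
//
//   *rdmult = (base_rdmult * 3) / 2;  // hypothetical: penalize rate more
//
// Returning VPX_DEFAULT_RDMULT, as above, keeps the encoder's own choice.
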
vpx_rc_status_t rc_delete_model(vpx_rc_model_t rate_ctrl_model) {
ToyRateCtrl *toy_rate_ctrl = static_cast<ToyRateCtrl *>(rate_ctrl_model);
EXPECT_EQ(toy_rate_ctrl->magic_number, kModelMagicNumber);
@@ -176,6 +679,7 @@ class ExtRateCtrlTest : public ::libvpx_test::EncoderTest,
::libvpx_test::Encoder *encoder) override {
if (video->frame() == 0) {
vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_QP;
rc_funcs.create_model = rc_create_model;
rc_funcs.send_firstpass_stats = rc_send_firstpass_stats;
rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision;
@@ -195,8 +699,266 @@ TEST_F(ExtRateCtrlTest, EncodeTest) {
"bus_352x288_420_f20_b8.yuv", VPX_IMG_FMT_I420, 352, 288, 30, 1, 0,
kFrameNum));
- ASSERT_NE(video.get(), nullptr);
+ ASSERT_NE(video, nullptr);
ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
}
+class ExtRateCtrlTestGOP : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+ ExtRateCtrlTestGOP() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestGOP() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+ encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP;
+ rc_funcs.create_model = rc_create_model_gop;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop;
+ rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop;
+ rc_funcs.get_gop_decision = rc_get_gop_decision;
+ rc_funcs.update_encodeframe_result = rc_update_encodeframe_result_gop;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+};
+
+TEST_F(ExtRateCtrlTestGOP, EncodeTest) {
+ cfg_.rc_target_bitrate = 4000;
+ cfg_.g_lag_in_frames = kMaxLagInFrames;
+ cfg_.rc_end_usage = VPX_VBR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ "noisy_clip_640_360.y4m", VPX_IMG_FMT_I420, 640, 360, 30, 1, 0,
+ kFrameNumGOP));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
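// Editorial note (inferred from the fixtures in this file): rc_funcs.rc_type
// selects which callbacks the encoder consults. VPX_RC_QP uses only the
// per-frame q decision; VPX_RC_GOP_QP additionally requires get_gop_decision;
// VPX_RC_GOP_QP_RDMULT (see ExtRateCtrlTestRdmult below) also wires up
// get_frame_rdmult.
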
+class ExtRateCtrlTestGOPShort : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+ ExtRateCtrlTestGOPShort() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestGOPShort() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+ encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+ encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP;
+ rc_funcs.create_model = rc_create_model_gop_short;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+ rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
+ rc_funcs.get_gop_decision = rc_get_gop_decision_short;
+ rc_funcs.update_encodeframe_result =
+ rc_update_encodeframe_result_gop_short;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+};
+
+TEST_F(ExtRateCtrlTestGOPShort, EncodeTest) {
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+ cfg_.rc_end_usage = VPX_VBR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOPShortOverlay
+ : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+ ExtRateCtrlTestGOPShortOverlay() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestGOPShortOverlay() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+ encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+ encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP;
+ rc_funcs.create_model = rc_create_model_gop_short;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+ rc_funcs.get_encodeframe_decision =
+ rc_get_encodeframe_decision_gop_short_overlay;
+ rc_funcs.get_gop_decision = rc_get_gop_decision_short_overlay;
+ rc_funcs.update_encodeframe_result =
+ rc_update_encodeframe_result_gop_short;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+};
+
+TEST_F(ExtRateCtrlTestGOPShortOverlay, EncodeTest) {
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+ cfg_.rc_end_usage = VPX_VBR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestGOPShortNoARF
+ : public ::libvpx_test::EncoderTest,
+ public ::libvpx_test::CodecTestWithParam<int> {
+ protected:
+ ExtRateCtrlTestGOPShortNoARF() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestGOPShortNoARF() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ encoder->Control(VP9E_SET_MIN_GF_INTERVAL, kDefaultMinGfInterval);
+ encoder->Control(VP9E_SET_MAX_GF_INTERVAL, kDefaultMaxGfInterval);
+ encoder->Control(VP9E_SET_TARGET_LEVEL, vp9::LEVEL_AUTO);
+
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP;
+ rc_funcs.create_model = rc_create_model_gop_short;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+ rc_funcs.get_encodeframe_decision =
+ rc_get_encodeframe_decision_gop_short_no_arf;
+ rc_funcs.get_gop_decision = rc_get_gop_decision_short_no_arf;
+ rc_funcs.update_encodeframe_result =
+ rc_update_encodeframe_result_gop_short;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+};
+
+TEST_F(ExtRateCtrlTestGOPShortNoARF, EncodeTest) {
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+ cfg_.rc_end_usage = VPX_VBR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+}
+
+class ExtRateCtrlTestRdmult : public ::libvpx_test::EncoderTest,
+ public ::testing::Test {
+ protected:
+ ExtRateCtrlTestRdmult() : EncoderTest(&::libvpx_test::kVP9) {}
+
+ ~ExtRateCtrlTestRdmult() override = default;
+
+ void SetUp() override {
+ InitializeConfig();
+ SetMode(::libvpx_test::kTwoPassGood);
+ }
+
+ void BeginPassHook(unsigned int) override {
+ psnr_ = 0.0;
+ nframes_ = 0;
+ }
+
+ void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) override {
+ psnr_ += pkt->data.psnr.psnr[0];
+ nframes_++;
+ }
+
+ void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+ ::libvpx_test::Encoder *encoder) override {
+ if (video->frame() == 0) {
+ vpx_rc_funcs_t rc_funcs;
+ rc_funcs.rc_type = VPX_RC_GOP_QP_RDMULT;
+ rc_funcs.create_model = rc_create_model_gop_short;
+ rc_funcs.send_firstpass_stats = rc_send_firstpass_stats_gop_short;
+ rc_funcs.get_encodeframe_decision = rc_get_encodeframe_decision_gop_short;
+ rc_funcs.get_gop_decision = rc_get_gop_decision_short;
+ rc_funcs.update_encodeframe_result =
+ rc_update_encodeframe_result_gop_short;
+ rc_funcs.get_frame_rdmult = rc_get_default_frame_rdmult;
+ rc_funcs.delete_model = rc_delete_model;
+ rc_funcs.priv = reinterpret_cast<void *>(PrivMagicNumber);
+ encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);
+ }
+ }
+
+ double GetAveragePsnr() const {
+ if (nframes_) return psnr_ / nframes_;
+ return 0.0;
+ }
+
+ private:
+ double psnr_;
+ unsigned int nframes_;
+};
+
+TEST_F(ExtRateCtrlTestRdmult, DefaultRdmult) {
+ cfg_.rc_target_bitrate = 500;
+ cfg_.g_lag_in_frames = kMaxLagInFrames - 1;
+ cfg_.rc_end_usage = VPX_VBR;
+ init_flags_ = VPX_CODEC_USE_PSNR;
+
+ std::unique_ptr<libvpx_test::VideoSource> video;
+ video.reset(new (std::nothrow) libvpx_test::YUVVideoSource(
+ kTestFileName, VPX_IMG_FMT_I420, 352, 288, 30, 1, 0, kFrameNumGOPShort));
+
+ ASSERT_NE(video, nullptr);
+ ASSERT_NO_FATAL_FAILURE(RunLoop(video.get()));
+
+ const double psnr = GetAveragePsnr();
+ EXPECT_GT(psnr, kPsnrThreshold);
+}
+
} // namespace
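
The three external rate control fixtures above differ only in which callbacks they install; the wiring is always the same: fill in a vpx_rc_funcs_t before the first frame and hand it to the encoder through the VP9E_SET_EXTERNAL_RATE_CONTROL control. A minimal sketch of that registration, with hypothetical my_* callbacks and state standing in for the test helpers:

    // Sketch only; my_create_model/my_delete_model/my_state are placeholders.
    vpx_rc_funcs_t rc_funcs = {};
    rc_funcs.rc_type = VPX_RC_GOP_QP;          // which decisions the model supplies
    rc_funcs.create_model = my_create_model;   // allocate model state
    rc_funcs.delete_model = my_delete_model;   // release it at teardown
    rc_funcs.priv = my_state;                  // opaque pointer echoed back to callbacks
    encoder->Control(VP9E_SET_EXTERNAL_RATE_CONTROL, &rc_funcs);

The rc_type value selects how much the external model overrides: VPX_RC_GOP_QP hands over GOP and QP decisions, while VPX_RC_GOP_QP_RDMULT (used by ExtRateCtrlTestRdmult) additionally supplies the per-frame rdmult via get_frame_rdmult.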
diff --git a/libvpx/test/vp9_quantize_test.cc b/libvpx/test/vp9_quantize_test.cc
index ca1062a76..587cec692 100644
--- a/libvpx/test/vp9_quantize_test.cc
+++ b/libvpx/test/vp9_quantize_test.cc
@@ -67,6 +67,45 @@ void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan);
}
+void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
+ int16_t *quant, int16_t *quant_shift,
+ int16_t *dequant, int16_t *round_fp,
+ int16_t *quant_fp) {
+ // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
+ constexpr int kMaxQRoundingFactorFp = 64;
+
+ for (int j = 0; j < 2; j++) {
+ // The range is 4 to 1828 in the VP9 tables.
+ const int qlookup = rnd->RandRange(1825) + 4;
+ round_fp[j] = (kMaxQRoundingFactorFp * qlookup) >> 7;
+ quant_fp[j] = (1 << 16) / qlookup;
+
+ // Values determined by deconstructing vp9_init_quantizer().
+ // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
+ // values or U/V values of any bit depth. This is because y_delta is not
+ // factored into the vp9_ac_quant() call.
+ zbin[j] = rnd->RandRange(1200);
+
+ // round may be up to 685 for Y values or 914 for U/V.
+ round[j] = rnd->RandRange(914);
+    // quant ranges from -32703 to 0 (RandRange(32704) yields 0..32703).
+ quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
+ // quant_shift goes up to 1 << 16.
+ quant_shift[j] = rnd->RandRange(16384);
+ // dequant maxes out at 1828 for all cases.
+ dequant[j] = rnd->RandRange(1828);
+ }
+ for (int j = 2; j < 8; j++) {
+ zbin[j] = zbin[1];
+ round_fp[j] = round_fp[1];
+ quant_fp[j] = quant_fp[1];
+ round[j] = round[1];
+ quant[j] = quant[1];
+ quant_shift[j] = quant_shift[1];
+ dequant[j] = dequant[1];
+ }
+}
+
class VP9QuantizeBase : public AbstractBench {
public:
VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
@@ -148,6 +187,7 @@ class VP9QuantizeTest : public VP9QuantizeBase,
protected:
virtual void Run();
+ void Speed(bool is_median);
const QuantizeFunc quantize_op_;
const QuantizeFunc ref_quantize_op_;
};
@@ -159,6 +199,101 @@ void VP9QuantizeTest::Run() {
scan_->iscan);
}
+void VP9QuantizeTest::Speed(bool is_median) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ ASSERT_TRUE(coeff_.Init());
+ ASSERT_TRUE(qcoeff_.Init());
+ ASSERT_TRUE(dqcoeff_.Init());
+ TX_SIZE starting_sz, ending_sz;
+
+ if (max_size_ == 16) {
+ starting_sz = TX_4X4;
+ ending_sz = TX_16X16;
+ } else {
+ starting_sz = TX_32X32;
+ ending_sz = TX_32X32;
+ }
+
+ for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
+ // zbin > coeff, zbin < coeff.
+ for (int i = 0; i < 2; ++i) {
+ // TX_TYPE defines the scan order. That is not relevant to the speed test.
+ // Pick the first one.
+ const TX_TYPE tx_type = DCT_DCT;
+ count_ = (4 << sz) * (4 << sz);
+ scan_ = &vp9_scan_orders[sz][tx_type];
+
+ GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
+ quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
+ quant_fp_ptr_);
+
+ if (i == 0) {
+ // When |coeff values| are less than zbin the results are 0.
+ int threshold = 100;
+ if (max_size_ == 32) {
+ // For 32x32, the threshold is halved. Double it to keep the values
+ // from clearing it.
+ threshold = 200;
+ }
+ for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
+ coeff_.Set(&rnd, -99, 99);
+ } else if (i == 1) {
+ for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
+ coeff_.Set(&rnd, -500, 500);
+ }
+
+ const char *type =
+ (i == 0) ? "Bypass calculations " : "Full calculations ";
+ char block_size[16];
+ snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
+ char title[100];
+ snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
+
+ if (is_median) {
+ RunNTimes(10000000 / count_);
+ PrintMedian(title);
+ } else {
+ Buffer<tran_low_t> ref_qcoeff =
+ Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(ref_qcoeff.Init());
+ Buffer<tran_low_t> ref_dqcoeff =
+ Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
+ ASSERT_TRUE(ref_dqcoeff.Init());
+ uint16_t ref_eob = 0;
+
+ const int kNumTests = 5000000;
+ vpx_usec_timer timer, simd_timer;
+
+ vpx_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_,
+ q_ptr_, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+ ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+ scan_->scan, scan_->iscan);
+ }
+ vpx_usec_timer_mark(&timer);
+
+ vpx_usec_timer_start(&simd_timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
+ quant_shift_ptr_, qcoeff_.TopLeftPixel(),
+ dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_,
+ scan_->scan, scan_->iscan);
+ }
+ vpx_usec_timer_mark(&simd_timer);
+
+ const int elapsed_time =
+ static_cast<int>(vpx_usec_timer_elapsed(&timer));
+ const int simd_elapsed_time =
+ static_cast<int>(vpx_usec_timer_elapsed(&simd_timer));
+ printf("%s c_time = %d \t simd_time = %d \t Gain = %f \n", title,
+ elapsed_time, simd_elapsed_time,
+ ((float)elapsed_time / simd_elapsed_time));
+ }
+ }
+ }
+}
+
// This quantizer compares the AC coefficients to the quantization step size to
// determine if further multiplication operations are needed.
// Based on vp9_quantize_fp_sse2().
@@ -254,45 +389,6 @@ void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
}
-void GenerateHelperArrays(ACMRandom *rnd, int16_t *zbin, int16_t *round,
- int16_t *quant, int16_t *quant_shift,
- int16_t *dequant, int16_t *round_fp,
- int16_t *quant_fp) {
- // Max when q == 0. Otherwise, it is 48 for Y and 42 for U/V.
- const int max_qrounding_factor_fp = 64;
-
- for (int j = 0; j < 2; j++) {
- // The range is 4 to 1828 in the VP9 tables.
- const int qlookup = rnd->RandRange(1825) + 4;
- round_fp[j] = (max_qrounding_factor_fp * qlookup) >> 7;
- quant_fp[j] = (1 << 16) / qlookup;
-
- // Values determined by deconstructing vp9_init_quantizer().
- // zbin may be up to 1143 for 8 and 10 bit Y values, or 1200 for 12 bit Y
- // values or U/V values of any bit depth. This is because y_delta is not
- // factored into the vp9_ac_quant() call.
- zbin[j] = rnd->RandRange(1200);
-
- // round may be up to 685 for Y values or 914 for U/V.
- round[j] = rnd->RandRange(914);
- // quant ranges from 1 to -32703
- quant[j] = static_cast<int>(rnd->RandRange(32704)) - 32703;
- // quant_shift goes up to 1 << 16.
- quant_shift[j] = rnd->RandRange(16384);
- // dequant maxes out at 1828 for all cases.
- dequant[j] = rnd->RandRange(1828);
- }
- for (int j = 2; j < 8; j++) {
- zbin[j] = zbin[1];
- round_fp[j] = round_fp[1];
- quant_fp[j] = quant_fp[1];
- round[j] = round[1];
- quant[j] = quant[1];
- quant_shift[j] = quant_shift[1];
- dequant[j] = dequant[1];
- }
-}
-
TEST_P(VP9QuantizeTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
ASSERT_TRUE(coeff_.Init());
@@ -403,60 +499,9 @@ TEST_P(VP9QuantizeTest, EOBCheck) {
}
}
-TEST_P(VP9QuantizeTest, DISABLED_Speed) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- ASSERT_TRUE(coeff_.Init());
- ASSERT_TRUE(qcoeff_.Init());
- ASSERT_TRUE(dqcoeff_.Init());
- TX_SIZE starting_sz, ending_sz;
-
- if (max_size_ == 16) {
- starting_sz = TX_4X4;
- ending_sz = TX_16X16;
- } else {
- starting_sz = TX_32X32;
- ending_sz = TX_32X32;
- }
-
- for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
- // zbin > coeff, zbin < coeff.
- for (int i = 0; i < 2; ++i) {
- // TX_TYPE defines the scan order. That is not relevant to the speed test.
- // Pick the first one.
- const TX_TYPE tx_type = DCT_DCT;
- count_ = (4 << sz) * (4 << sz);
- scan_ = &vp9_scan_orders[sz][tx_type];
-
- GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
- quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
- quant_fp_ptr_);
-
- if (i == 0) {
- // When |coeff values| are less than zbin the results are 0.
- int threshold = 100;
- if (max_size_ == 32) {
- // For 32x32, the threshold is halved. Double it to keep the values
- // from clearing it.
- threshold = 200;
- }
- for (int j = 0; j < 8; ++j) zbin_ptr_[j] = threshold;
- coeff_.Set(&rnd, -99, 99);
- } else if (i == 1) {
- for (int j = 0; j < 8; ++j) zbin_ptr_[j] = 50;
- coeff_.Set(&rnd, -500, 500);
- }
+TEST_P(VP9QuantizeTest, DISABLED_Speed) { Speed(false); }
- RunNTimes(10000000 / count_);
- const char *type =
- (i == 0) ? "Bypass calculations " : "Full calculations ";
- char block_size[16];
- snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
- char title[100];
- snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
- PrintMedian(title);
- }
- }
-}
+TEST_P(VP9QuantizeTest, DISABLED_SpeedMedian) { Speed(true); }
using std::make_tuple;
@@ -467,6 +512,8 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Values(
make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_sse2>,
+ &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
VPX_BITS_8, 16, false),
make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
@@ -492,7 +539,6 @@ INSTANTIATE_TEST_SUITE_P(
#endif // HAVE_SSE2
#if HAVE_SSSE3
-#if VPX_ARCH_X86_64
INSTANTIATE_TEST_SUITE_P(
SSSE3, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
@@ -506,16 +552,6 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_ssse3>,
&QuantFPWrapper<quantize_fp_32x32_nz_c>,
VPX_BITS_8, 32, true)));
-#else
-INSTANTIATE_TEST_SUITE_P(
- SSSE3, VP9QuantizeTest,
- ::testing::Values(make_tuple(&vpx_quantize_b_ssse3, &vpx_quantize_b_c,
- VPX_BITS_8, 16, false),
- make_tuple(&vpx_quantize_b_32x32_ssse3,
- &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
- false)));
-
-#endif // VPX_ARCH_X86_64
#endif // HAVE_SSSE3
#if HAVE_AVX
@@ -529,14 +565,78 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
#endif // HAVE_AVX
#if VPX_ARCH_X86_64 && HAVE_AVX2
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, VP9QuantizeTest,
+ ::testing::Values(
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
+ &QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8, 16, true),
+ make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_avx2>,
+ &QuantFPWrapper<vp9_highbd_quantize_fp_c>, VPX_BITS_12, 16,
+ true),
+ make_tuple(&QuantFPWrapper<vp9_highbd_quantize_fp_32x32_avx2>,
+ &QuantFPWrapper<vp9_highbd_quantize_fp_32x32_c>, VPX_BITS_12,
+ 32, true),
+ make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c, VPX_BITS_8, 16,
+ false),
+ make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_10, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_avx2, &vpx_highbd_quantize_b_c,
+ VPX_BITS_12, 16, false),
+ make_tuple(&vpx_quantize_b_32x32_avx2, &vpx_quantize_b_32x32_c,
+ VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_avx2,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
+#else
INSTANTIATE_TEST_SUITE_P(
AVX2, VP9QuantizeTest,
::testing::Values(make_tuple(&QuantFPWrapper<vp9_quantize_fp_avx2>,
&QuantFPWrapper<quantize_fp_nz_c>, VPX_BITS_8,
- 16, true)));
+ 16, true),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_avx2>,
+ &QuantFPWrapper<quantize_fp_32x32_nz_c>,
+ VPX_BITS_8, 32, true),
+ make_tuple(&vpx_quantize_b_avx2, &vpx_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&vpx_quantize_b_32x32_avx2,
+ &vpx_quantize_b_32x32_c, VPX_BITS_8, 32,
+ false)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_AVX2
#if HAVE_NEON
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+ NEON, VP9QuantizeTest,
+ ::testing::Values(
+ make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c, VPX_BITS_8, 16,
+ false),
+ make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+ VPX_BITS_8, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+ VPX_BITS_10, 16, false),
+ make_tuple(&vpx_highbd_quantize_b_neon, &vpx_highbd_quantize_b_c,
+ VPX_BITS_12, 16, false),
+ make_tuple(&vpx_quantize_b_32x32_neon, &vpx_quantize_b_32x32_c,
+ VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
+ make_tuple(&vpx_highbd_quantize_b_32x32_neon,
+ &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_neon>,
+ &QuantFPWrapper<vp9_quantize_fp_c>, VPX_BITS_8, 16, true),
+ make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
+ &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
+ true)));
+#else
INSTANTIATE_TEST_SUITE_P(
NEON, VP9QuantizeTest,
::testing::Values(make_tuple(&vpx_quantize_b_neon, &vpx_quantize_b_c,
@@ -550,6 +650,7 @@ INSTANTIATE_TEST_SUITE_P(
make_tuple(&QuantFPWrapper<vp9_quantize_fp_32x32_neon>,
&QuantFPWrapper<vp9_quantize_fp_32x32_c>,
VPX_BITS_8, 32, true)));
+#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // HAVE_NEON
#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
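
The speed-test change above is a consolidation: the old DISABLED_Speed body moves into a shared VP9QuantizeTest::Speed(bool is_median) helper, where is_median == true keeps the RunNTimes()/PrintMedian() median benchmark and is_median == false adds a wall-clock comparison of the C reference against the SIMD kernel. The comparison follows the usual vpx_usec_timer idiom, roughly as below (run_c()/run_simd() are hypothetical stand-ins for the two quantize calls):

    // Sketch of the C-vs-SIMD loop behind Speed(false).
    vpx_usec_timer c_timer, simd_timer;
    vpx_usec_timer_start(&c_timer);
    for (int n = 0; n < kNumTests; ++n) run_c();     // reference kernel
    vpx_usec_timer_mark(&c_timer);
    vpx_usec_timer_start(&simd_timer);
    for (int n = 0; n < kNumTests; ++n) run_simd();  // optimized kernel
    vpx_usec_timer_mark(&simd_timer);
    const double gain =
        static_cast<double>(vpx_usec_timer_elapsed(&c_timer)) /
        static_cast<double>(vpx_usec_timer_elapsed(&simd_timer));  // >1: SIMD wins

Both timing loops reuse the same randomized inputs, so the printed gain reflects only the kernel difference.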
diff --git a/libvpx/test/vp9_ratectrl_rtc_test.cc b/libvpx/test/vp9_ratectrl_rtc_test.cc
index b09a45bb7..1d1a78f43 100644
--- a/libvpx/test/vp9_ratectrl_rtc_test.cc
+++ b/libvpx/test/vp9_ratectrl_rtc_test.cc
@@ -26,7 +26,11 @@ namespace {
const size_t kNumFrames = 300;
-const int kTemporalId[4] = { 0, 2, 1, 2 };
+const int kTemporalId3Layer[4] = { 0, 2, 1, 2 };
+const int kTemporalId2Layer[2] = { 0, 1 };
+const int kTemporalRateAllocation3Layer[3] = { 50, 70, 100 };
+const int kTemporalRateAllocation2Layer[2] = { 60, 100 };
+const int kSpatialLayerBitrate[3] = { 200, 400, 1000 };
class RcInterfaceTest
: public ::libvpx_test::EncoderTest,
@@ -179,28 +183,73 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
encoder->Control(VP9E_SET_SVC, 1);
encoder->Control(VP9E_SET_SVC_PARAMETERS, &svc_params_);
}
-
- frame_params_.frame_type = video->frame() == 0 ? KEY_FRAME : INTER_FRAME;
- if (rc_cfg_.rc_mode == VPX_CBR && frame_params_.frame_type == INTER_FRAME) {
- // Disable golden frame update.
- frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
- frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
- }
+ frame_params_.frame_type =
+ video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
encoder_exit_ = video->frame() == kNumFrames;
current_superframe_ = video->frame();
+ if (dynamic_spatial_layers_ == 1) {
+ if (video->frame() == 100) {
+ // Go down to 2 spatial layers: set top SL to 0 bitrate.
+ // Update the encoder config.
+ cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[8];
+ cfg_.layer_target_bitrate[6] = 0;
+ cfg_.layer_target_bitrate[7] = 0;
+ cfg_.layer_target_bitrate[8] = 0;
+ encoder->Config(&cfg_);
+ // Update the RC config.
+ rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[8];
+ rc_cfg_.layer_target_bitrate[6] = 0;
+ rc_cfg_.layer_target_bitrate[7] = 0;
+ rc_cfg_.layer_target_bitrate[8] = 0;
+ rc_api_->UpdateRateControl(rc_cfg_);
+ } else if (video->frame() == 200) {
+ // Go down to 1 spatial layer.
+ // Update the encoder config.
+ cfg_.rc_target_bitrate -= cfg_.layer_target_bitrate[5];
+ cfg_.layer_target_bitrate[3] = 0;
+ cfg_.layer_target_bitrate[4] = 0;
+ cfg_.layer_target_bitrate[5] = 0;
+ encoder->Config(&cfg_);
+ // Update the RC config.
+ rc_cfg_.target_bandwidth -= rc_cfg_.layer_target_bitrate[5];
+ rc_cfg_.layer_target_bitrate[3] = 0;
+ rc_cfg_.layer_target_bitrate[4] = 0;
+ rc_cfg_.layer_target_bitrate[5] = 0;
+ rc_api_->UpdateRateControl(rc_cfg_);
+ } else if (0 && video->frame() == 280) {
+ // TODO(marpan): Re-enable this going back up when issue is fixed.
+ // Go back up to 3 spatial layers.
+ // Update the encoder config: use the original bitrates.
+ SetEncoderConfigSvc(3, 3);
+ encoder->Config(&cfg_);
+ // Update the RC config.
+ SetRCConfigSvc(3, 3);
+ rc_api_->UpdateRateControl(rc_cfg_);
+ }
+ }
}
virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
::libvpx_test::CxDataIterator iter = encoder->GetCxData();
+ for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) sizes_[sl] = 0;
while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
ParseSuperframeSizes(static_cast<const uint8_t *>(pkt->data.frame.buf),
pkt->data.frame.sz);
for (int sl = 0; sl < rc_cfg_.ss_number_layers; sl++) {
- frame_params_.spatial_layer_id = sl;
- frame_params_.temporal_layer_id = kTemporalId[current_superframe_ % 4];
- rc_api_->ComputeQP(frame_params_);
- frame_params_.frame_type = INTER_FRAME;
- rc_api_->PostEncodeUpdate(sizes_[sl]);
+ if (sizes_[sl] > 0) {
+ frame_params_.spatial_layer_id = sl;
+ if (rc_cfg_.ts_number_layers == 3)
+ frame_params_.temporal_layer_id =
+ kTemporalId3Layer[current_superframe_ % 4];
+ else if (rc_cfg_.ts_number_layers == 2)
+ frame_params_.temporal_layer_id =
+ kTemporalId2Layer[current_superframe_ % 2];
+ else
+ frame_params_.temporal_layer_id = 0;
+ rc_api_->ComputeQP(frame_params_);
+ frame_params_.frame_type = INTER_FRAME;
+ rc_api_->PostEncodeUpdate(sizes_[sl]);
+ }
}
}
if (!encoder_exit_) {
@@ -218,9 +267,37 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
const vpx_image_t * /*img2*/) {}
void RunSvc() {
- SetConfigSvc();
+ dynamic_spatial_layers_ = 0;
+ SetRCConfigSvc(3, 3);
+ key_interval_ = 10000;
+ rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+ SetEncoderConfigSvc(3, 3);
+
+ ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+ 1280, 720, 30, 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvcPeriodicKey() {
+ dynamic_spatial_layers_ = 0;
+ SetRCConfigSvc(3, 3);
+ key_interval_ = 100;
+ rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
+ SetEncoderConfigSvc(3, 3);
+
+ ::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
+ 1280, 720, 30, 1, 0, kNumFrames);
+
+ ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+ }
+
+ void RunSvcDynamicSpatial() {
+ dynamic_spatial_layers_ = 1;
+ SetRCConfigSvc(3, 3);
+ key_interval_ = 10000;
rc_api_ = libvpx::VP9RateControlRTC::Create(rc_cfg_);
- SetEncoderSvc();
+ SetEncoderConfigSvc(3, 3);
::libvpx_test::I420VideoSource video("desktop_office1.1280_720-020.yuv",
1280, 720, 30, 1, 0, kNumFrames);
@@ -256,30 +333,54 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
return VPX_CODEC_OK;
}
- void SetEncoderSvc() {
- cfg_.ss_number_layers = 3;
- cfg_.ts_number_layers = 3;
+ void SetEncoderConfigSvc(int number_spatial_layers,
+ int number_temporal_layers) {
+ cfg_.g_w = 1280;
+ cfg_.g_h = 720;
+ cfg_.ss_number_layers = number_spatial_layers;
+ cfg_.ts_number_layers = number_temporal_layers;
cfg_.g_timebase.num = 1;
cfg_.g_timebase.den = 30;
- svc_params_.scaling_factor_num[0] = 72;
- svc_params_.scaling_factor_den[0] = 288;
- svc_params_.scaling_factor_num[1] = 144;
- svc_params_.scaling_factor_den[1] = 288;
- svc_params_.scaling_factor_num[2] = 288;
- svc_params_.scaling_factor_den[2] = 288;
+ if (number_spatial_layers == 3) {
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 4;
+ svc_params_.scaling_factor_num[1] = 2;
+ svc_params_.scaling_factor_den[1] = 4;
+ svc_params_.scaling_factor_num[2] = 4;
+ svc_params_.scaling_factor_den[2] = 4;
+ } else if (number_spatial_layers == 2) {
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 2;
+ svc_params_.scaling_factor_num[1] = 2;
+ svc_params_.scaling_factor_den[1] = 2;
+ } else if (number_spatial_layers == 1) {
+ svc_params_.scaling_factor_num[0] = 1;
+ svc_params_.scaling_factor_den[0] = 1;
+ }
+
for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
svc_params_.max_quantizers[i] = 56;
svc_params_.min_quantizers[i] = 2;
svc_params_.speed_per_layer[i] = 7;
+ svc_params_.loopfilter_ctrl[i] = LOOPFILTER_ALL;
}
cfg_.rc_end_usage = VPX_CBR;
cfg_.g_lag_in_frames = 0;
cfg_.g_error_resilient = 0;
- // 3 temporal layers
- cfg_.ts_rate_decimator[0] = 4;
- cfg_.ts_rate_decimator[1] = 2;
- cfg_.ts_rate_decimator[2] = 1;
- cfg_.temporal_layering_mode = 3;
+
+ if (number_temporal_layers == 3) {
+ cfg_.ts_rate_decimator[0] = 4;
+ cfg_.ts_rate_decimator[1] = 2;
+ cfg_.ts_rate_decimator[2] = 1;
+ cfg_.temporal_layering_mode = 3;
+ } else if (number_temporal_layers == 2) {
+ cfg_.ts_rate_decimator[0] = 2;
+ cfg_.ts_rate_decimator[1] = 1;
+ cfg_.temporal_layering_mode = 2;
+ } else if (number_temporal_layers == 1) {
+ cfg_.ts_rate_decimator[0] = 1;
+ cfg_.temporal_layering_mode = 0;
+ }
cfg_.rc_buf_initial_sz = 500;
cfg_.rc_buf_optimal_sz = 600;
@@ -288,27 +389,39 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
cfg_.rc_max_quantizer = 56;
cfg_.g_threads = 1;
cfg_.kf_max_dist = 9999;
- cfg_.rc_target_bitrate = 1600;
cfg_.rc_overshoot_pct = 50;
cfg_.rc_undershoot_pct = 50;
- cfg_.layer_target_bitrate[0] = 100;
- cfg_.layer_target_bitrate[1] = 140;
- cfg_.layer_target_bitrate[2] = 200;
- cfg_.layer_target_bitrate[3] = 250;
- cfg_.layer_target_bitrate[4] = 350;
- cfg_.layer_target_bitrate[5] = 500;
- cfg_.layer_target_bitrate[6] = 450;
- cfg_.layer_target_bitrate[7] = 630;
- cfg_.layer_target_bitrate[8] = 900;
+ cfg_.rc_target_bitrate = 0;
+ for (int sl = 0; sl < number_spatial_layers; sl++) {
+ int spatial_bitrate = 0;
+ if (number_spatial_layers <= 3)
+ spatial_bitrate = kSpatialLayerBitrate[sl];
+ for (int tl = 0; tl < number_temporal_layers; tl++) {
+ int layer = sl * number_temporal_layers + tl;
+ if (number_temporal_layers == 3)
+ cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+ else if (number_temporal_layers == 2)
+ cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+ else if (number_temporal_layers == 1)
+ cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+ }
+ cfg_.rc_target_bitrate += spatial_bitrate;
+ }
+
+ cfg_.kf_min_dist = key_interval_;
+ cfg_.kf_max_dist = key_interval_;
}
- void SetConfigSvc() {
+ void SetRCConfigSvc(int number_spatial_layers, int number_temporal_layers) {
rc_cfg_.width = 1280;
rc_cfg_.height = 720;
+ rc_cfg_.ss_number_layers = number_spatial_layers;
+ rc_cfg_.ts_number_layers = number_temporal_layers;
rc_cfg_.max_quantizer = 56;
rc_cfg_.min_quantizer = 2;
- rc_cfg_.target_bandwidth = 1600;
rc_cfg_.buf_initial_sz = 500;
rc_cfg_.buf_optimal_sz = 600;
rc_cfg_.buf_sz = 1000;
@@ -316,31 +429,55 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
rc_cfg_.overshoot_pct = 50;
rc_cfg_.max_intra_bitrate_pct = 900;
rc_cfg_.framerate = 30.0;
- rc_cfg_.ss_number_layers = 3;
- rc_cfg_.ts_number_layers = 3;
rc_cfg_.rc_mode = VPX_CBR;
rc_cfg_.aq_mode = aq_mode_;
- rc_cfg_.scaling_factor_num[0] = 1;
- rc_cfg_.scaling_factor_den[0] = 4;
- rc_cfg_.scaling_factor_num[1] = 2;
- rc_cfg_.scaling_factor_den[1] = 4;
- rc_cfg_.scaling_factor_num[2] = 4;
- rc_cfg_.scaling_factor_den[2] = 4;
-
- rc_cfg_.ts_rate_decimator[0] = 4;
- rc_cfg_.ts_rate_decimator[1] = 2;
- rc_cfg_.ts_rate_decimator[2] = 1;
-
- rc_cfg_.layer_target_bitrate[0] = 100;
- rc_cfg_.layer_target_bitrate[1] = 140;
- rc_cfg_.layer_target_bitrate[2] = 200;
- rc_cfg_.layer_target_bitrate[3] = 250;
- rc_cfg_.layer_target_bitrate[4] = 350;
- rc_cfg_.layer_target_bitrate[5] = 500;
- rc_cfg_.layer_target_bitrate[6] = 450;
- rc_cfg_.layer_target_bitrate[7] = 630;
- rc_cfg_.layer_target_bitrate[8] = 900;
+ if (number_spatial_layers == 3) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 4;
+ rc_cfg_.scaling_factor_num[1] = 2;
+ rc_cfg_.scaling_factor_den[1] = 4;
+ rc_cfg_.scaling_factor_num[2] = 4;
+ rc_cfg_.scaling_factor_den[2] = 4;
+ } else if (number_spatial_layers == 2) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 2;
+ rc_cfg_.scaling_factor_num[1] = 2;
+ rc_cfg_.scaling_factor_den[1] = 2;
+ } else if (number_spatial_layers == 1) {
+ rc_cfg_.scaling_factor_num[0] = 1;
+ rc_cfg_.scaling_factor_den[0] = 1;
+ }
+
+ if (number_temporal_layers == 3) {
+ rc_cfg_.ts_rate_decimator[0] = 4;
+ rc_cfg_.ts_rate_decimator[1] = 2;
+ rc_cfg_.ts_rate_decimator[2] = 1;
+ } else if (number_temporal_layers == 2) {
+ rc_cfg_.ts_rate_decimator[0] = 2;
+ rc_cfg_.ts_rate_decimator[1] = 1;
+ } else if (number_temporal_layers == 1) {
+ rc_cfg_.ts_rate_decimator[0] = 1;
+ }
+
+ rc_cfg_.target_bandwidth = 0;
+ for (int sl = 0; sl < number_spatial_layers; sl++) {
+ int spatial_bitrate = 0;
+ if (number_spatial_layers <= 3)
+ spatial_bitrate = kSpatialLayerBitrate[sl];
+ for (int tl = 0; tl < number_temporal_layers; tl++) {
+ int layer = sl * number_temporal_layers + tl;
+ if (number_temporal_layers == 3)
+ rc_cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation3Layer[tl] * spatial_bitrate / 100;
+ else if (number_temporal_layers == 2)
+ rc_cfg_.layer_target_bitrate[layer] =
+ kTemporalRateAllocation2Layer[tl] * spatial_bitrate / 100;
+ else if (number_temporal_layers == 1)
+ rc_cfg_.layer_target_bitrate[layer] = spatial_bitrate;
+ }
+ rc_cfg_.target_bandwidth += spatial_bitrate;
+ }
for (int sl = 0; sl < rc_cfg_.ss_number_layers; ++sl) {
for (int tl = 0; tl < rc_cfg_.ts_number_layers; ++tl) {
@@ -359,6 +496,8 @@ class RcInterfaceSvcTest : public ::libvpx_test::EncoderTest,
bool encoder_exit_;
int current_superframe_;
uint32_t sizes_[8];
+ int key_interval_;
+ int dynamic_spatial_layers_;
};
TEST_P(RcInterfaceTest, OneLayer) { RunOneLayer(); }
@@ -367,6 +506,10 @@ TEST_P(RcInterfaceTest, OneLayerVBRPeriodicKey) { RunOneLayerVBRPeriodicKey(); }
TEST_P(RcInterfaceSvcTest, Svc) { RunSvc(); }
+TEST_P(RcInterfaceSvcTest, SvcPeriodicKey) { RunSvcPeriodicKey(); }
+
+TEST_P(RcInterfaceSvcTest, SvcDynamicSpatial) { RunSvcDynamicSpatial(); }
+
VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3),
::testing::Values(VPX_CBR, VPX_VBR));
VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3));
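
The rewritten SVC configs replace the nine hardcoded layer_target_bitrate values with a rule: each spatial layer sl gets kSpatialLayerBitrate[sl] kbps, and the temporal layers within it receive cumulative percentages of that rate (the top temporal layer always carries 100%, since temporal allocations include the layers below). The layer index is sl * number_temporal_layers + tl. Worked through for the 3x3 case used by the tests:

    // layer = sl * 3 + tl; allocation = {50, 70, 100} percent of the SL rate.
    // sl = 0 (200 kbps):  layers 0..2 = {100, 140, 200}
    // sl = 1 (400 kbps):  layers 3..5 = {200, 280, 400}
    // sl = 2 (1000 kbps): layers 6..8 = {500, 700, 1000}
    // target bitrate = 200 + 400 + 1000 = 1600 kbps (sum of spatial rates)
    const int layer = sl * number_temporal_layers + tl;
    cfg_.layer_target_bitrate[layer] =
        kTemporalRateAllocation3Layer[tl] * kSpatialLayerBitrate[sl] / 100;

This also keeps the dynamic-spatial test simple: dropping a spatial layer zeroes its three layer entries and subtracts its top temporal layer's bitrate (the full spatial rate) from the total.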
diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc
index 211cc6c7a..a57082f1e 100644
--- a/libvpx/test/vp9_subtract_test.cc
+++ b/libvpx/test/vp9_subtract_test.cc
@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <tuple>
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -17,9 +18,11 @@
#include "test/bench.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
+#include "test/util.h"
#include "vp9/common/vp9_blockd.h"
#include "vpx_ports/msvc.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/vpx_timer.h"
typedef void (*SubtractFunc)(int rows, int cols, int16_t *diff_ptr,
ptrdiff_t diff_stride, const uint8_t *src_ptr,
@@ -133,6 +136,10 @@ INSTANTIATE_TEST_SUITE_P(C, VP9SubtractBlockTest,
INSTANTIATE_TEST_SUITE_P(SSE2, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_sse2));
#endif
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(AVX2, VP9SubtractBlockTest,
+ ::testing::Values(vpx_subtract_block_avx2));
+#endif
#if HAVE_NEON
INSTANTIATE_TEST_SUITE_P(NEON, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_neon));
@@ -157,4 +164,158 @@ INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest,
::testing::Values(vpx_subtract_block_lsx));
#endif
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void (*HBDSubtractFunc)(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride, int bd);
+
+// <BLOCK_SIZE, bit_depth, optimized subtract func, reference subtract func>
+using Params = std::tuple<BLOCK_SIZE, int, HBDSubtractFunc, HBDSubtractFunc>;
+
+class VPXHBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+ virtual void SetUp() {
+ block_width_ = 4 * num_4x4_blocks_wide_lookup[GET_PARAM(0)];
+ block_height_ = 4 * num_4x4_blocks_high_lookup[GET_PARAM(0)];
+ bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(1));
+ func_ = GET_PARAM(2);
+ ref_func_ = GET_PARAM(3);
+
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+
+ constexpr size_t kMaxWidth = 128;
+ constexpr size_t kMaxBlockSize = kMaxWidth * kMaxWidth;
+ src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+ ASSERT_NE(src_, nullptr);
+ pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+ vpx_memalign(16, kMaxBlockSize * sizeof(uint16_t))));
+ ASSERT_NE(pred_, nullptr);
+ diff_ = reinterpret_cast<int16_t *>(
+ vpx_memalign(16, kMaxBlockSize * sizeof(int16_t)));
+ ASSERT_NE(diff_, nullptr);
+ }
+
+ virtual void TearDown() {
+ vpx_free(CONVERT_TO_SHORTPTR(src_));
+ vpx_free(CONVERT_TO_SHORTPTR(pred_));
+ vpx_free(diff_);
+ }
+
+ protected:
+ void CheckResult();
+ void RunForSpeed();
+
+ private:
+ ACMRandom rnd_;
+ int block_height_;
+ int block_width_;
+ vpx_bit_depth_t bit_depth_;
+ HBDSubtractFunc func_;
+ HBDSubtractFunc ref_func_;
+ uint8_t *src_;
+ uint8_t *pred_;
+ int16_t *diff_;
+};
+
+void VPXHBDSubtractBlockTest::CheckResult() {
+ constexpr int kTestNum = 100;
+ constexpr int kMaxWidth = 128;
+ constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+ const int mask = (1 << bit_depth_) - 1;
+ for (int i = 0; i < kTestNum; ++i) {
+ for (int j = 0; j < kMaxBlockSize; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_, bit_depth_);
+
+ for (int r = 0; r < block_height_; ++r) {
+ for (int c = 0; c < block_width_; ++c) {
+ EXPECT_EQ(diff_[r * block_width_ + c],
+ (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+ CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+ << "r = " << r << ", c = " << c << ", test: " << i;
+ }
+ }
+ }
+}
+
+TEST_P(VPXHBDSubtractBlockTest, CheckResult) { CheckResult(); }
+
+void VPXHBDSubtractBlockTest::RunForSpeed() {
+ constexpr int kTestNum = 200000;
+ constexpr int kMaxWidth = 128;
+ constexpr int kMaxBlockSize = kMaxWidth * kMaxWidth;
+ const int mask = (1 << bit_depth_) - 1;
+
+ if (ref_func_ == func_) GTEST_SKIP();
+
+ for (int j = 0; j < kMaxBlockSize; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ vpx_usec_timer ref_timer;
+ vpx_usec_timer_start(&ref_timer);
+ for (int i = 0; i < kTestNum; ++i) {
+ ref_func_(block_height_, block_width_, diff_, block_width_, src_,
+ block_width_, pred_, block_width_, bit_depth_);
+ }
+ vpx_usec_timer_mark(&ref_timer);
+ const int64_t ref_elapsed_time = vpx_usec_timer_elapsed(&ref_timer);
+
+ for (int j = 0; j < kMaxBlockSize; ++j) {
+ CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+ CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+ }
+
+ vpx_usec_timer timer;
+ vpx_usec_timer_start(&timer);
+ for (int i = 0; i < kTestNum; ++i) {
+ func_(block_height_, block_width_, diff_, block_width_, src_, block_width_,
+ pred_, block_width_, bit_depth_);
+ }
+ vpx_usec_timer_mark(&timer);
+ const int64_t elapsed_time = vpx_usec_timer_elapsed(&timer);
+
+ printf(
+ "[%dx%d]: "
+ "ref_time=%6" PRId64 " \t simd_time=%6" PRId64
+ " \t "
+ "gain=%f \n",
+ block_width_, block_height_, ref_elapsed_time, elapsed_time,
+ static_cast<double>(ref_elapsed_time) /
+ static_cast<double>(elapsed_time));
+}
+
+TEST_P(VPXHBDSubtractBlockTest, DISABLED_Speed) { RunForSpeed(); }
+
+const BLOCK_SIZE kValidBlockSize[] = { BLOCK_4X4, BLOCK_4X8, BLOCK_8X4,
+ BLOCK_8X8, BLOCK_8X16, BLOCK_16X8,
+ BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
+ BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
+ BLOCK_64X64 };
+
+INSTANTIATE_TEST_SUITE_P(
+ C, VPXHBDSubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(12),
+ ::testing::Values(&vpx_highbd_subtract_block_c),
+ ::testing::Values(&vpx_highbd_subtract_block_c)));
+
+#if HAVE_AVX2
+INSTANTIATE_TEST_SUITE_P(
+ AVX2, VPXHBDSubtractBlockTest,
+ ::testing::Combine(::testing::ValuesIn(kValidBlockSize),
+ ::testing::Values(12),
+ ::testing::Values(&vpx_highbd_subtract_block_avx2),
+ ::testing::Values(&vpx_highbd_subtract_block_c)));
+#endif // HAVE_AVX2
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace vp9
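
One convention worth noting in the new high-bit-depth subtract test: the sample buffers are uint16_t, but the HBDSubtractFunc signature takes uint8_t pointers, so the test routes every allocation through CONVERT_TO_BYTEPTR and indexes and frees through CONVERT_TO_SHORTPTR. Assuming the usual libvpx definitions (vpx_dsp/vpx_dsp_common.h encodes the conversion as a pointer shift, not a plain cast), the round trip looks roughly like:

    // Sketch of the HBD pointer convention; treat the details as assumptions.
    uint16_t *samples = reinterpret_cast<uint16_t *>(
        vpx_memalign(16, 128 * 128 * sizeof(uint16_t)));
    uint8_t *as_bytes = CONVERT_TO_BYTEPTR(samples);  // pass through uint8_t* APIs
    CONVERT_TO_SHORTPTR(as_bytes)[0] = 1023;          // recover uint16_t* to index
    vpx_free(CONVERT_TO_SHORTPTR(as_bytes));          // free the real pointer

Because the converted value is not a dereferenceable uint8_t pointer, forgetting CONVERT_TO_SHORTPTR on free (as TearDown does correctly above) would hand vpx_free a bogus address.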
diff --git a/libvpx/third_party/googletest/README.libvpx b/libvpx/third_party/googletest/README.libvpx
index b9a74922f..5f6b01b0e 100644
--- a/libvpx/third_party/googletest/README.libvpx
+++ b/libvpx/third_party/googletest/README.libvpx
@@ -1,5 +1,5 @@
URL: https://github.com/google/googletest.git
-Version: release-1.11.0
+Version: release-1.12.1
License: BSD
License File: LICENSE
@@ -13,9 +13,17 @@ generation.
Local Modifications:
- Remove everything but:
+ .clang-format
CONTRIBUTORS
googletest/
include
README.md
src
LICENSE
+- Move .clang-format, CONTRIBUTORS, and LICENSE into googletest/
+- In googletest/include/gtest/internal/custom/gtest-port.h, define
+ GTEST_HAS_NOTIFICATION_ as 1 and use a stub Notification class to fix
+ the mingw32 g++ compilation errors caused by the lack of std::mutex
+ and std::condition_variable in the <mutex> and <condition_variable>
+ headers if mingw32 is configured with the win32 threads option. See
+ https://stackoverflow.com/questions/17242516/mingw-w64-threads-posix-vs-win32
diff --git a/libvpx/third_party/googletest/src/.clang-format b/libvpx/third_party/googletest/src/.clang-format
new file mode 100644
index 000000000..5b9bfe6d2
--- /dev/null
+++ b/libvpx/third_party/googletest/src/.clang-format
@@ -0,0 +1,4 @@
+# Run manually to reformat a file:
+# clang-format -i --style=file <file>
+Language: Cpp
+BasedOnStyle: Google
diff --git a/libvpx/third_party/googletest/src/CONTRIBUTORS b/libvpx/third_party/googletest/src/CONTRIBUTORS
index 76db0b40f..77397a5b5 100644
--- a/libvpx/third_party/googletest/src/CONTRIBUTORS
+++ b/libvpx/third_party/googletest/src/CONTRIBUTORS
@@ -34,6 +34,7 @@ Manuel Klimek <klimek@google.com>
Mario Tanev <radix@google.com>
Mark Paskin
Markus Heule <markus.heule@gmail.com>
+Martijn Vels <mvels@google.com>
Matthew Simmons <simmonmt@acm.org>
Mika Raento <mikie@iki.fi>
Mike Bland <mbland@google.com>
@@ -55,6 +56,7 @@ Russ Rufer <russ@pentad.com>
Sean Mcafee <eefacm@gmail.com>
Sigurður Ásgeirsson <siggi@google.com>
Sverre Sundsdal <sundsdal@gmail.com>
+Szymon Sobik <sobik.szymon@gmail.com>
Takeshi Yoshino <tyoshino@google.com>
Tracy Bialik <tracy@pentad.com>
Vadim Berman <vadimb@google.com>
diff --git a/libvpx/third_party/googletest/src/README.md b/libvpx/third_party/googletest/src/README.md
index 1f8b349ae..d26b309ed 100644
--- a/libvpx/third_party/googletest/src/README.md
+++ b/libvpx/third_party/googletest/src/README.md
@@ -25,7 +25,7 @@ When building GoogleTest as a standalone project, the typical workflow starts
with
```
-git clone https://github.com/google/googletest.git -b release-1.10.0
+git clone https://github.com/google/googletest.git -b release-1.11.0
cd googletest # Main directory of the cloned repository.
mkdir build # Create a directory to hold the build output.
cd build
@@ -94,7 +94,7 @@ include(FetchContent)
FetchContent_Declare(
googletest
# Specify the commit you depend on and update it regularly.
- URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
+ URL https://github.com/google/googletest/archive/e2239ee6043f73722e7aa812a459f54a28552929.zip
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
@@ -203,7 +203,9 @@ add
-DGTEST_DONT_DEFINE_FOO=1
to the compiler flags to tell GoogleTest to change the macro's name from `FOO`
-to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
+to `GTEST_FOO`. Currently `FOO` can be `ASSERT_EQ`, `ASSERT_FALSE`, `ASSERT_GE`,
+`ASSERT_GT`, `ASSERT_LE`, `ASSERT_LT`, `ASSERT_NE`, `ASSERT_TRUE`,
+`EXPECT_FALSE`, `EXPECT_TRUE`, `FAIL`, `SUCCEED`, `TEST`, or `TEST_F`. For
example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
GTEST_TEST(SomeTest, DoesThis) { ... }
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h b/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h
new file mode 100644
index 000000000..addbb59c6
--- /dev/null
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-assertion-result.h
@@ -0,0 +1,237 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file implements the AssertionResult type.
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
+
+#include <memory>
+#include <ostream>
+#include <string>
+#include <type_traits>
+
+#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
+
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
+
+namespace testing {
+
+// A class for indicating whether an assertion was successful. When
+// the assertion wasn't successful, the AssertionResult object
+// remembers a non-empty message that describes how it failed.
+//
+// To create an instance of this class, use one of the factory functions
+// (AssertionSuccess() and AssertionFailure()).
+//
+// This class is useful for two purposes:
+// 1. Defining predicate functions to be used with Boolean test assertions
+// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
+// 2. Defining predicate-format functions to be
+// used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
+//
+// For example, if you define IsEven predicate:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
+// will print the message
+//
+// Value of: IsEven(Fib(5))
+// Actual: false (5 is odd)
+// Expected: true
+//
+// instead of a more opaque
+//
+// Value of: IsEven(Fib(5))
+// Actual: false
+// Expected: true
+//
+// in case IsEven is a simple Boolean predicate.
+//
+// If you expect your predicate to be reused and want to support informative
+// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
+// about half as often as positive ones in our tests), supply messages for
+// both success and failure cases:
+//
+// testing::AssertionResult IsEven(int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess() << n << " is even";
+// else
+// return testing::AssertionFailure() << n << " is odd";
+// }
+//
+// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
+//
+// Value of: IsEven(Fib(6))
+// Actual: true (8 is even)
+// Expected: false
+//
+// NB: Predicates that support negative Boolean assertions have reduced
+// performance in positive ones so be careful not to use them in tests
+// that have lots (tens of thousands) of positive Boolean assertions.
+//
+// To use this class with EXPECT_PRED_FORMAT assertions such as:
+//
+// // Verifies that Foo() returns an even number.
+// EXPECT_PRED_FORMAT1(IsEven, Foo());
+//
+// you need to define:
+//
+// testing::AssertionResult IsEven(const char* expr, int n) {
+// if ((n % 2) == 0)
+// return testing::AssertionSuccess();
+// else
+// return testing::AssertionFailure()
+// << "Expected: " << expr << " is even\n Actual: it's " << n;
+// }
+//
+// If Foo() returns 5, you will see the following message:
+//
+// Expected: Foo() is even
+// Actual: it's 5
+//
+class GTEST_API_ AssertionResult {
+ public:
+ // Copy constructor.
+ // Used in EXPECT_TRUE/FALSE(assertion_result).
+ AssertionResult(const AssertionResult& other);
+
+// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
+// This warning is not emitted in Visual Studio 2017.
+// This warning is off by default starting in Visual Studio 2019 but can be
+// enabled with command-line options.
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
+#endif
+
+ // Used in the EXPECT_TRUE/FALSE(bool_expression).
+ //
+ // T must be contextually convertible to bool.
+ //
+ // The second parameter prevents this overload from being considered if
+ // the argument is implicitly convertible to AssertionResult. In that case
+ // we want AssertionResult's copy constructor to be used.
+ template <typename T>
+ explicit AssertionResult(
+ const T& success,
+ typename std::enable_if<
+ !std::is_convertible<T, AssertionResult>::value>::type*
+ /*enabler*/
+ = nullptr)
+ : success_(success) {}
+
+#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
+ GTEST_DISABLE_MSC_WARNINGS_POP_()
+#endif
+
+ // Assignment operator.
+ AssertionResult& operator=(AssertionResult other) {
+ swap(other);
+ return *this;
+ }
+
+ // Returns true if and only if the assertion succeeded.
+ operator bool() const { return success_; } // NOLINT
+
+ // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+ AssertionResult operator!() const;
+
+ // Returns the text streamed into this AssertionResult. Test assertions
+ // use it when they fail (i.e., the predicate's outcome doesn't match the
+ // assertion's expectation). When nothing has been streamed into the
+ // object, returns an empty string.
+ const char* message() const {
+ return message_.get() != nullptr ? message_->c_str() : "";
+ }
+ // Deprecated; please use message() instead.
+ const char* failure_message() const { return message(); }
+
+ // Streams a custom failure message into this object.
+ template <typename T>
+ AssertionResult& operator<<(const T& value) {
+ AppendMessage(Message() << value);
+ return *this;
+ }
+
+ // Allows streaming basic output manipulators such as endl or flush into
+ // this object.
+ AssertionResult& operator<<(
+ ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
+ AppendMessage(Message() << basic_manipulator);
+ return *this;
+ }
+
+ private:
+ // Appends the contents of message to message_.
+ void AppendMessage(const Message& a_message) {
+ if (message_.get() == nullptr) message_.reset(new ::std::string);
+ message_->append(a_message.GetString().c_str());
+ }
+
+ // Swap the contents of this AssertionResult with other.
+ void swap(AssertionResult& other);
+
+ // Stores result of the assertion predicate.
+ bool success_;
+ // Stores the message describing the condition in case the expectation
+ // construct is not satisfied with the predicate's outcome.
+ // Referenced via a pointer to avoid taking too much stack frame space
+ // with test assertions.
+ std::unique_ptr< ::std::string> message_;
+};
+
+// Makes a successful assertion result.
+GTEST_API_ AssertionResult AssertionSuccess();
+
+// Makes a failed assertion result.
+GTEST_API_ AssertionResult AssertionFailure();
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << msg.
+GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
+
+} // namespace testing
+
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+
+#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_ASSERTION_RESULT_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
index 9b4d4d133..84e5a5bbd 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
@@ -27,21 +27,21 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the public API for death tests. It is
// #included by gtest.h so a user doesn't need to include this
// directly.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
#include "gtest/internal/gtest-death-test-internal.h"
-namespace testing {
-
// This flag controls the style of death tests. Valid values are "threadsafe",
// meaning that the death test child process will re-execute the test binary
// from the start, running only a single death test, or "fast",
@@ -49,6 +49,8 @@ namespace testing {
// after forking.
GTEST_DECLARE_string_(death_test_style);
+namespace testing {
+
#if GTEST_HAS_DEATH_TEST
namespace internal {
@@ -103,7 +105,6 @@ GTEST_API_ bool InDeathTestChild();
//
// On the regular expressions used in death tests:
//
-// GOOGLETEST_CM0005 DO NOT DELETE
// On POSIX-compliant systems (*nix), we use the <regex.h> library,
// which uses the POSIX extended regex syntax.
//
@@ -169,24 +170,24 @@ GTEST_API_ bool InDeathTestChild();
// Asserts that a given `statement` causes the program to exit, with an
// integer exit status that satisfies `predicate`, and emitting error output
// that matches `matcher`.
-# define ASSERT_EXIT(statement, predicate, matcher) \
- GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
+#define ASSERT_EXIT(statement, predicate, matcher) \
+ GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
// Like `ASSERT_EXIT`, but continues on to successive tests in the
// test suite, if any:
-# define EXPECT_EXIT(statement, predicate, matcher) \
- GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_EXIT(statement, predicate, matcher) \
+ GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
// Asserts that a given `statement` causes the program to exit, either by
// explicitly exiting with a nonzero exit code or being killed by a
// signal, and emitting error output that matches `matcher`.
-# define ASSERT_DEATH(statement, matcher) \
- ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+#define ASSERT_DEATH(statement, matcher) \
+ ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
// Like `ASSERT_DEATH`, but continues on to successive tests in the
// test suite, if any:
-# define EXPECT_DEATH(statement, matcher) \
- EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
+#define EXPECT_DEATH(statement, matcher) \
+ EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
@@ -197,22 +198,23 @@ class GTEST_API_ ExitedWithCode {
ExitedWithCode(const ExitedWithCode&) = default;
void operator=(const ExitedWithCode& other) = delete;
bool operator()(int exit_status) const;
+
private:
const int exit_code_;
};
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Tests that an exit code describes an exit due to termination by a
// given signal.
-// GOOGLETEST_CM0006 DO NOT DELETE
class GTEST_API_ KilledBySignal {
public:
explicit KilledBySignal(int signum);
bool operator()(int exit_status) const;
+
private:
const int signum_;
};
-# endif // !GTEST_OS_WINDOWS
+#endif // !GTEST_OS_WINDOWS
// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode.
// The death testing framework causes this to have interesting semantics,
@@ -257,23 +259,21 @@ class GTEST_API_ KilledBySignal {
// EXPECT_EQ(12, DieInDebugOr12(&sideeffect));
// }, "death");
//
-# ifdef NDEBUG
+#ifdef NDEBUG
-# define EXPECT_DEBUG_DEATH(statement, regex) \
+#define EXPECT_DEBUG_DEATH(statement, regex) \
GTEST_EXECUTE_STATEMENT_(statement, regex)
-# define ASSERT_DEBUG_DEATH(statement, regex) \
+#define ASSERT_DEBUG_DEATH(statement, regex) \
GTEST_EXECUTE_STATEMENT_(statement, regex)
-# else
+#else
-# define EXPECT_DEBUG_DEATH(statement, regex) \
- EXPECT_DEATH(statement, regex)
+#define EXPECT_DEBUG_DEATH(statement, regex) EXPECT_DEATH(statement, regex)
-# define ASSERT_DEBUG_DEATH(statement, regex) \
- ASSERT_DEATH(statement, regex)
+#define ASSERT_DEBUG_DEATH(statement, regex) ASSERT_DEATH(statement, regex)
-# endif // NDEBUG for EXPECT_DEBUG_DEATH
+#endif // NDEBUG for EXPECT_DEBUG_DEATH
#endif // GTEST_HAS_DEATH_TEST
// This macro is used for implementing macros such as
@@ -311,18 +311,17 @@ class GTEST_API_ KilledBySignal {
// statement unconditionally returns or throws. The Message constructor at
// the end allows the syntax of streaming additional messages into the
// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH.
-# define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
- GTEST_LOG_(WARNING) \
- << "Death tests are not supported on this platform.\n" \
- << "Statement '" #statement "' cannot be verified."; \
- } else if (::testing::internal::AlwaysFalse()) { \
- ::testing::internal::RE::PartialMatch(".*", (regex)); \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- terminator; \
- } else \
- ::testing::Message()
+#define GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, terminator) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ GTEST_LOG_(WARNING) << "Death tests are not supported on this platform.\n" \
+ << "Statement '" #statement "' cannot be verified."; \
+ } else if (::testing::internal::AlwaysFalse()) { \
+ ::testing::internal::RE::PartialMatch(".*", (regex)); \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ terminator; \
+ } else \
+ ::testing::Message()
// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and
// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if
@@ -330,15 +329,15 @@ class GTEST_API_ KilledBySignal {
// useful when you are combining death test assertions with normal test
// assertions in one test.
#if GTEST_HAS_DEATH_TEST
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
- EXPECT_DEATH(statement, regex)
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
- ASSERT_DEATH(statement, regex)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ EXPECT_DEATH(statement, regex)
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ ASSERT_DEATH(statement, regex)
#else
-# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
- GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
-# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
- GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
+#define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, )
+#define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \
+ GTEST_UNSUPPORTED_DEATH_TEST(statement, regex, return)
#endif
} // namespace testing
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h b/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
index 9fa34a05b..bffa00c53 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
@@ -32,6 +32,10 @@
// This file implements just enough of the matcher interface to allow
// EXPECT_DEATH and friends to accept a matcher argument.
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
@@ -98,11 +102,11 @@ class MatchResultListener {
private:
::std::ostream* const stream_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(MatchResultListener);
+ MatchResultListener(const MatchResultListener&) = delete;
+ MatchResultListener& operator=(const MatchResultListener&) = delete;
};
-inline MatchResultListener::~MatchResultListener() {
-}
+inline MatchResultListener::~MatchResultListener() {}
// An instance of a subclass of this knows how to describe itself as a
// matcher.
@@ -176,27 +180,39 @@ namespace internal {
struct AnyEq {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a == b; }
+ bool operator()(const A& a, const B& b) const {
+ return a == b;
+ }
};
struct AnyNe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a != b; }
+ bool operator()(const A& a, const B& b) const {
+ return a != b;
+ }
};
struct AnyLt {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a < b; }
+ bool operator()(const A& a, const B& b) const {
+ return a < b;
+ }
};
struct AnyGt {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a > b; }
+ bool operator()(const A& a, const B& b) const {
+ return a > b;
+ }
};
struct AnyLe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a <= b; }
+ bool operator()(const A& a, const B& b) const {
+ return a <= b;
+ }
};
struct AnyGe {
template <typename A, typename B>
- bool operator()(const A& a, const B& b) const { return a >= b; }
+ bool operator()(const A& a, const B& b) const {
+ return a >= b;
+ }
};
// A match result listener that ignores the explanation.
@@ -205,7 +221,8 @@ class DummyMatchResultListener : public MatchResultListener {
DummyMatchResultListener() : MatchResultListener(nullptr) {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DummyMatchResultListener);
+ DummyMatchResultListener(const DummyMatchResultListener&) = delete;
+ DummyMatchResultListener& operator=(const DummyMatchResultListener&) = delete;
};
// A match result listener that forwards the explanation to a given
@@ -217,7 +234,9 @@ class StreamMatchResultListener : public MatchResultListener {
: MatchResultListener(os) {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
+ StreamMatchResultListener(const StreamMatchResultListener&) = delete;
+ StreamMatchResultListener& operator=(const StreamMatchResultListener&) =
+ delete;
};
struct SharedPayloadBase {
@@ -284,17 +303,18 @@ class MatcherBase : private MatcherDescriberInterface {
}
protected:
- MatcherBase() : vtable_(nullptr) {}
+ MatcherBase() : vtable_(nullptr), buffer_() {}
// Constructs a matcher from its implementation.
template <typename U>
- explicit MatcherBase(const MatcherInterface<U>* impl) {
+ explicit MatcherBase(const MatcherInterface<U>* impl)
+ : vtable_(nullptr), buffer_() {
Init(impl);
}
template <typename M, typename = typename std::remove_reference<
M>::type::is_gtest_matcher>
- MatcherBase(M&& m) { // NOLINT
+ MatcherBase(M&& m) : vtable_(nullptr), buffer_() { // NOLINT
Init(std::forward<M>(m));
}
@@ -420,8 +440,8 @@ class MatcherBase : private MatcherDescriberInterface {
static const M& Get(const MatcherBase& m) {
// When inlined along with Init, need to be explicit to avoid violating
// strict aliasing rules.
- const M *ptr = static_cast<const M*>(
- static_cast<const void*>(&m.buffer_));
+ const M* ptr =
+ static_cast<const M*>(static_cast<const void*>(&m.buffer_));
return *ptr;
}
static void Init(MatcherBase& m, M impl) {
@@ -741,7 +761,7 @@ template <typename Rhs>
class EqMatcher : public ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq> {
public:
explicit EqMatcher(const Rhs& rhs)
- : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) { }
+ : ComparisonBase<EqMatcher<Rhs>, Rhs, AnyEq>(rhs) {}
static const char* Desc() { return "is equal to"; }
static const char* NegatedDesc() { return "isn't equal to"; }
};
@@ -749,7 +769,7 @@ template <typename Rhs>
class NeMatcher : public ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe> {
public:
explicit NeMatcher(const Rhs& rhs)
- : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) { }
+ : ComparisonBase<NeMatcher<Rhs>, Rhs, AnyNe>(rhs) {}
static const char* Desc() { return "isn't equal to"; }
static const char* NegatedDesc() { return "is equal to"; }
};
@@ -757,7 +777,7 @@ template <typename Rhs>
class LtMatcher : public ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt> {
public:
explicit LtMatcher(const Rhs& rhs)
- : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) { }
+ : ComparisonBase<LtMatcher<Rhs>, Rhs, AnyLt>(rhs) {}
static const char* Desc() { return "is <"; }
static const char* NegatedDesc() { return "isn't <"; }
};
@@ -765,7 +785,7 @@ template <typename Rhs>
class GtMatcher : public ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt> {
public:
explicit GtMatcher(const Rhs& rhs)
- : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) { }
+ : ComparisonBase<GtMatcher<Rhs>, Rhs, AnyGt>(rhs) {}
static const char* Desc() { return "is >"; }
static const char* NegatedDesc() { return "isn't >"; }
};
@@ -773,7 +793,7 @@ template <typename Rhs>
class LeMatcher : public ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe> {
public:
explicit LeMatcher(const Rhs& rhs)
- : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) { }
+ : ComparisonBase<LeMatcher<Rhs>, Rhs, AnyLe>(rhs) {}
static const char* Desc() { return "is <="; }
static const char* NegatedDesc() { return "isn't <="; }
};
@@ -781,7 +801,7 @@ template <typename Rhs>
class GeMatcher : public ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe> {
public:
explicit GeMatcher(const Rhs& rhs)
- : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) { }
+ : ComparisonBase<GeMatcher<Rhs>, Rhs, AnyGe>(rhs) {}
static const char* Desc() { return "is >="; }
static const char* NegatedDesc() { return "isn't >="; }
};
@@ -872,12 +892,16 @@ PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
// Note: if the parameter of Eq() were declared as const T&, Eq("foo")
// wouldn't compile.
template <typename T>
-inline internal::EqMatcher<T> Eq(T x) { return internal::EqMatcher<T>(x); }
+inline internal::EqMatcher<T> Eq(T x) {
+ return internal::EqMatcher<T>(x);
+}
// Constructs a Matcher<T> from a 'value' of type T. The constructed
// matcher matches any value that's equal to 'value'.
template <typename T>
-Matcher<T>::Matcher(T value) { *this = Eq(value); }
+Matcher<T>::Matcher(T value) {
+ *this = Eq(value);
+}
// Creates a monomorphic matcher that matches anything with type Lhs
// and equal to rhs. A user may need to use this instead of Eq(...)
@@ -892,7 +916,9 @@ Matcher<T>::Matcher(T value) { *this = Eq(value); }
// can always write Matcher<T>(Lt(5)) to be explicit about the type,
// for example.
template <typename Lhs, typename Rhs>
-inline Matcher<Lhs> TypedEq(const Rhs& rhs) { return Eq(rhs); }
+inline Matcher<Lhs> TypedEq(const Rhs& rhs) {
+ return Eq(rhs);
+}
// Creates a polymorphic matcher that matches anything >= x.
template <typename Rhs>
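// A usage sketch for the factories reformatted above: Eq/Ne/Lt/Gt/Le/Ge
// build polymorphic comparison matchers, and TypedEq pins the matcher to
// a specific argument type when overload resolution needs the hint.

#include "gtest/gtest.h"

TEST(MatcherSketch, EqAndTypedEq) {
  testing::Matcher<int> is_five = testing::Eq(5);  // polymorphic Eq, bound to int
  EXPECT_TRUE(is_five.Matches(5));
  EXPECT_FALSE(is_five.Matches(6));
  testing::Matcher<int> is_seven = testing::TypedEq<int>(7);  // explicitly typed
  EXPECT_TRUE(is_seven.Matches(7));
}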
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
index becfd49fc..6c8bf9000 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the Message class.
@@ -42,7 +41,9 @@
// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user
// program!
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
@@ -110,8 +111,8 @@ class GTEST_API_ Message {
// Streams a non-pointer value to this object.
template <typename T>
- inline Message& operator <<(const T& val) {
- // Some libraries overload << for STL containers. These
+ inline Message& operator<<(const T& val) {
+ // Some libraries overload << for STL containers. These
// overloads are defined in the global namespace instead of ::std.
//
// C++'s symbol lookup rule (i.e. Koenig lookup) says that these
@@ -125,7 +126,7 @@ class GTEST_API_ Message {
// from the global namespace. With this using declaration,
// overloads of << defined in the global namespace and those
// visible via Koenig lookup are both exposed in this function.
- using ::operator <<;
+ using ::operator<<;
*ss_ << val;
return *this;
}
@@ -144,7 +145,7 @@ class GTEST_API_ Message {
// ensure consistent result across compilers, we always treat NULL
// as "(null)".
template <typename T>
- inline Message& operator <<(T* const& pointer) { // NOLINT
+ inline Message& operator<<(T* const& pointer) { // NOLINT
if (pointer == nullptr) {
*ss_ << "(null)";
} else {
@@ -159,25 +160,23 @@ class GTEST_API_ Message {
// templatized version above. Without this definition, streaming
// endl or other basic IO manipulators to Message will confuse the
// compiler.
- Message& operator <<(BasicNarrowIoManip val) {
+ Message& operator<<(BasicNarrowIoManip val) {
*ss_ << val;
return *this;
}
// Instead of 1/0, we want to see true/false for bool values.
- Message& operator <<(bool b) {
- return *this << (b ? "true" : "false");
- }
+ Message& operator<<(bool b) { return *this << (b ? "true" : "false"); }
// These two overloads allow streaming a wide C string to a Message
// using the UTF-8 encoding.
- Message& operator <<(const wchar_t* wide_c_str);
- Message& operator <<(wchar_t* wide_c_str);
+ Message& operator<<(const wchar_t* wide_c_str);
+ Message& operator<<(wchar_t* wide_c_str);
#if GTEST_HAS_STD_WSTRING
// Converts the given wide string to a narrow string using the UTF-8
// encoding, and streams the result to this Message object.
- Message& operator <<(const ::std::wstring& wstr);
+ Message& operator<<(const ::std::wstring& wstr);
#endif // GTEST_HAS_STD_WSTRING
// Gets the text streamed to this object so far as an std::string.
@@ -196,7 +195,7 @@ class GTEST_API_ Message {
};
// Streams a Message to an ostream.
-inline std::ostream& operator <<(std::ostream& os, const Message& sb) {
+inline std::ostream& operator<<(std::ostream& os, const Message& sb) {
return os << sb.GetString();
}
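// A sketch of the Message interface whose operators were reformatted
// above: values accumulate via operator<<, bool renders as true/false,
// null pointers render as "(null)", and GetString() returns the result.

#include <string>

#include "gtest/gtest.h"

std::string Describe(int n) {
  testing::Message msg;
  msg << "n=" << n << " even=" << (n % 2 == 0)  // bool prints as true/false
      << " ptr=" << static_cast<int*>(nullptr);  // prints "(null)"
  return msg.GetString();  // e.g. "n=4 even=true ptr=(null)"
}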
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
index 804e70281..b55119ac6 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
@@ -26,11 +26,14 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Macros and functions for implementing parameterized tests
// in Google C++ Testing and Mocking Framework (Google Test)
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
+
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
@@ -353,9 +356,7 @@ internal::ValueArray<T...> Values(T... v) {
// }
// INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagDependentTest, Bool());
//
-inline internal::ParamGenerator<bool> Bool() {
- return Values(false, true);
-}
+inline internal::ParamGenerator<bool> Bool() { return Values(false, true); }
// Combine() allows the user to combine two or more sequences to produce
// values of a Cartesian product of those sequences' elements.
@@ -428,8 +429,11 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
return 0; \
} \
static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_; \
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &) = delete; /* NOLINT */ \
}; \
int GTEST_TEST_CLASS_NAME_(test_suite_name, \
test_name)::gtest_registering_dummy_ = \
@@ -453,43 +457,42 @@ internal::CartesianProductHolder<Generator...> Combine(const Generator&... g) {
#define GTEST_GET_FIRST_(first, ...) first
#define GTEST_GET_SECOND_(first, second, ...) second
-#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \
- static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \
- gtest_##prefix##test_suite_name##_EvalGenerator_() { \
- return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \
- } \
- static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \
- const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \
- if (::testing::internal::AlwaysFalse()) { \
- ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \
- __VA_ARGS__, \
- ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
- DUMMY_PARAM_))); \
- auto t = std::make_tuple(__VA_ARGS__); \
- static_assert(std::tuple_size<decltype(t)>::value <= 2, \
- "Too Many Args!"); \
- } \
- return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \
- __VA_ARGS__, \
- ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
- DUMMY_PARAM_))))(info); \
- } \
- static int gtest_##prefix##test_suite_name##_dummy_ \
- GTEST_ATTRIBUTE_UNUSED_ = \
- ::testing::UnitTest::GetInstance() \
- ->parameterized_test_registry() \
- .GetTestSuitePatternHolder<test_suite_name>( \
- GTEST_STRINGIFY_(test_suite_name), \
- ::testing::internal::CodeLocation(__FILE__, __LINE__)) \
- ->AddTestSuiteInstantiation( \
- GTEST_STRINGIFY_(prefix), \
- &gtest_##prefix##test_suite_name##_EvalGenerator_, \
- &gtest_##prefix##test_suite_name##_EvalGenerateName_, \
+#define INSTANTIATE_TEST_SUITE_P(prefix, test_suite_name, ...) \
+ static ::testing::internal::ParamGenerator<test_suite_name::ParamType> \
+ gtest_##prefix##test_suite_name##_EvalGenerator_() { \
+ return GTEST_EXPAND_(GTEST_GET_FIRST_(__VA_ARGS__, DUMMY_PARAM_)); \
+ } \
+ static ::std::string gtest_##prefix##test_suite_name##_EvalGenerateName_( \
+ const ::testing::TestParamInfo<test_suite_name::ParamType>& info) { \
+ if (::testing::internal::AlwaysFalse()) { \
+ ::testing::internal::TestNotEmpty(GTEST_EXPAND_(GTEST_GET_SECOND_( \
+ __VA_ARGS__, \
+ ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+ DUMMY_PARAM_))); \
+ auto t = std::make_tuple(__VA_ARGS__); \
+ static_assert(std::tuple_size<decltype(t)>::value <= 2, \
+ "Too Many Args!"); \
+ } \
+ return ((GTEST_EXPAND_(GTEST_GET_SECOND_( \
+ __VA_ARGS__, \
+ ::testing::internal::DefaultParamName<test_suite_name::ParamType>, \
+ DUMMY_PARAM_))))(info); \
+ } \
+ static int gtest_##prefix##test_suite_name##_dummy_ \
+ GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::UnitTest::GetInstance() \
+ ->parameterized_test_registry() \
+ .GetTestSuitePatternHolder<test_suite_name>( \
+ GTEST_STRINGIFY_(test_suite_name), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__)) \
+ ->AddTestSuiteInstantiation( \
+ GTEST_STRINGIFY_(prefix), \
+ &gtest_##prefix##test_suite_name##_EvalGenerator_, \
+ &gtest_##prefix##test_suite_name##_EvalGenerateName_, \
__FILE__, __LINE__)
-
// Allow Marking a Parameterized test class as not needing to be instantiated.
-#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \
+#define GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(T) \
namespace gtest_do_not_use_outside_namespace_scope {} \
static const ::testing::internal::MarkAsIgnored gtest_allow_ignore_##T( \
GTEST_STRINGIFY_(T))
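// A sketch of the value-parameterized flow these macros register,
// mirroring the Bool() example in the comments above (FlagTest is an
// illustrative name):

#include "gtest/gtest.h"

class FlagTest : public testing::TestWithParam<bool> {};

TEST_P(FlagTest, HandlesBothSettings) {
  const bool flag = GetParam();  // false on one run, true on the other
  EXPECT_TRUE(flag || !flag);
}

INSTANTIATE_TEST_SUITE_P(BoolSequence, FlagTest, testing::Bool());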
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
index 076c9de1f..a91e8b8b1 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Test - The Google C++ Testing and Mocking Framework
//
// This file implements a universal value printer that can print a
@@ -95,7 +94,9 @@
// being defined as many user-defined container types don't have
// value_type.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
@@ -257,12 +258,10 @@ struct ConvertibleToStringViewPrinter {
#endif
};
-
// Prints the given number of bytes in the given object to the given
// ostream.
GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
- size_t count,
- ::std::ostream* os);
+ size_t count, ::std::ostream* os);
struct RawBytesPrinter {
// SFINAE on `sizeof` to make sure we have a complete type.
template <typename T, size_t = sizeof(T)>
@@ -360,7 +359,7 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
-#ifdef __cpp_char8_t
+#ifdef __cpp_lib_char8_t
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
#endif
@@ -375,12 +374,12 @@ GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
// to point to a NUL-terminated string, and thus can print it as a string.
#define GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(CharType, OtherStringType) \
- template <> \
- class FormatForComparison<CharType*, OtherStringType> { \
- public: \
- static ::std::string Format(CharType* value) { \
- return ::testing::PrintToString(value); \
- } \
+ template <> \
+ class FormatForComparison<CharType*, OtherStringType> { \
+ public: \
+ static ::std::string Format(CharType* value) { \
+ return ::testing::PrintToString(value); \
+ } \
}
GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
@@ -410,8 +409,8 @@ GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const wchar_t, ::std::wstring);
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
template <typename T1, typename T2>
-std::string FormatForComparisonFailureMessage(
- const T1& value, const T2& /* other_operand */) {
+std::string FormatForComparisonFailureMessage(const T1& value,
+ const T2& /* other_operand */) {
return FormatForComparison<T1, T2>::Format(value);
}
@@ -479,6 +478,12 @@ inline void PrintTo(char8_t c, ::std::ostream* os) {
}
#endif
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+GTEST_API_ void PrintTo(__uint128_t v, ::std::ostream* os);
+GTEST_API_ void PrintTo(__int128_t v, ::std::ostream* os);
+#endif // __SIZEOF_INT128__
+
// Overloads for C strings.
GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
inline void PrintTo(char* s, ::std::ostream* os) {
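// With the PrintTo overloads declared above, gcc/clang 128-bit integers
// appear as numeric values in failure messages rather than raw bytes.
// A sketch, guarded the same way as the declarations:

#include "gtest/gtest.h"

#if defined(__SIZEOF_INT128__)
TEST(Int128Sketch, ComparesAndPrints) {
  const __int128_t big = static_cast<__int128_t>(1) << 100;
  EXPECT_EQ(big, big);  // a failing variant would format big via PrintTo
}
#endif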
@@ -545,7 +550,7 @@ void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) {
}
// Overloads for ::std::string.
-GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os);
+GTEST_API_ void PrintStringTo(const ::std::string& s, ::std::ostream* os);
inline void PrintTo(const ::std::string& s, ::std::ostream* os) {
PrintStringTo(s, os);
}
@@ -572,7 +577,7 @@ inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) {
// Overloads for ::std::wstring.
#if GTEST_HAS_STD_WSTRING
-GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
+GTEST_API_ void PrintWideStringTo(const ::std::wstring& s, ::std::ostream* os);
inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) {
PrintWideStringTo(s, os);
}
@@ -587,6 +592,12 @@ inline void PrintTo(internal::StringView sp, ::std::ostream* os) {
inline void PrintTo(std::nullptr_t, ::std::ostream* os) { *os << "(nullptr)"; }
+#if GTEST_HAS_RTTI
+inline void PrintTo(const std::type_info& info, std::ostream* os) {
+ *os << internal::GetTypeName(info);
+}
+#endif // GTEST_HAS_RTTI
+
template <typename T>
void PrintTo(std::reference_wrapper<T> ref, ::std::ostream* os) {
UniversalPrinter<T&>::Print(ref.get(), os);
@@ -744,6 +755,14 @@ class UniversalPrinter<Optional<T>> {
}
};
+template <>
+class UniversalPrinter<decltype(Nullopt())> {
+ public:
+ static void Print(decltype(Nullopt()), ::std::ostream* os) {
+ *os << "(nullopt)";
+ }
+};
+
#endif // GTEST_INTERNAL_HAS_OPTIONAL
#if GTEST_INTERNAL_HAS_VARIANT
@@ -802,8 +821,8 @@ void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) {
}
}
// This overload prints a (const) char array compactly.
-GTEST_API_ void UniversalPrintArray(
- const char* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const char* begin, size_t len,
+ ::std::ostream* os);
#ifdef __cpp_char8_t
// This overload prints a (const) char8_t array compactly.
@@ -820,8 +839,8 @@ GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len,
::std::ostream* os);
// This overload prints a (const) wchar_t array compactly.
-GTEST_API_ void UniversalPrintArray(
- const wchar_t* begin, size_t len, ::std::ostream* os);
+GTEST_API_ void UniversalPrintArray(const wchar_t* begin, size_t len,
+ ::std::ostream* os);
// Implements printing an array type T[N].
template <typename T, size_t N>
@@ -980,10 +999,10 @@ void UniversalPrint(const T& value, ::std::ostream* os) {
UniversalPrinter<T1>::Print(value, os);
}
-typedef ::std::vector< ::std::string> Strings;
+typedef ::std::vector<::std::string> Strings;
- // Tersely prints the first N fields of a tuple to a string vector,
- // one element for each field.
+// Tersely prints the first N fields of a tuple to a string vector,
+// one element for each field.
template <typename Tuple>
void TersePrintPrefixToStrings(const Tuple&, std::integral_constant<size_t, 0>,
Strings*) {}
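// The printer machinery above picks up a PrintTo overload found by
// argument-dependent lookup in the value's namespace. A minimal sketch
// for a user type (Point and myapp are illustrative names):

#include <ostream>

#include "gtest/gtest.h"

namespace myapp {
struct Point {
  int x, y;
};
// Found via ADL; used by assertions and by testing::PrintToString.
void PrintTo(const Point& p, std::ostream* os) {
  *os << "(" << p.x << ", " << p.y << ")";
}
}  // namespace myapp

TEST(PrintToSketch, UsesCustomPrinter) {
  EXPECT_EQ("(1, 2)", testing::PrintToString(myapp::Point{1, 2}));
}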
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
index eacef4466..bec8c4810 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
@@ -27,12 +27,9 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// Utilities for testing Google Test itself and code that uses Google Test
// (e.g. frameworks built on top of Google Test).
-// GOOGLETEST_CM0004 DO NOT DELETE
-
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
@@ -88,7 +85,10 @@ class GTEST_API_ ScopedFakeTestPartResultReporter
TestPartResultReporterInterface* old_reporter_;
TestPartResultArray* const result_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter);
+ ScopedFakeTestPartResultReporter(const ScopedFakeTestPartResultReporter&) =
+ delete;
+ ScopedFakeTestPartResultReporter& operator=(
+ const ScopedFakeTestPartResultReporter&) = delete;
};
namespace internal {
@@ -104,12 +104,14 @@ class GTEST_API_ SingleFailureChecker {
SingleFailureChecker(const TestPartResultArray* results,
TestPartResult::Type type, const std::string& substr);
~SingleFailureChecker();
+
private:
const TestPartResultArray* const results_;
const TestPartResult::Type type_;
const std::string substr_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker);
+ SingleFailureChecker(const SingleFailureChecker&) = delete;
+ SingleFailureChecker& operator=(const SingleFailureChecker&) = delete;
};
} // namespace internal
@@ -119,7 +121,8 @@ class GTEST_API_ SingleFailureChecker {
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// A set of macros for testing Google Test assertions or code that's expected
-// to generate Google Test fatal failures. It verifies that the given
+// to generate Google Test fatal failures (e.g. a failure from an ASSERT_EQ, but
+// not a non-fatal failure, as from EXPECT_EQ). It verifies that the given
// statement will cause exactly one fatal Google Test failure with 'substr'
// being part of the failure message.
//
@@ -141,44 +144,46 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// helper macro, due to some peculiarity in how the preprocessor
// works. The AcceptsMacroThatExpandsToUnprotectedComma test in
// gtest_unittest.cc will fail to compile if we do that.
-#define EXPECT_FATAL_FAILURE(statement, substr) \
- do { \
- class GTestExpectFatalFailureHelper {\
- public:\
- static void Execute() { statement; }\
- };\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
- GTestExpectFatalFailureHelper::Execute();\
- }\
+#define EXPECT_FATAL_FAILURE(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper { \
+ public: \
+ static void Execute() { statement; } \
+ }; \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, \
+ &gtest_failures); \
+ GTestExpectFatalFailureHelper::Execute(); \
+ } \
} while (::testing::internal::AlwaysFalse())
-#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
- do { \
- class GTestExpectFatalFailureHelper {\
- public:\
- static void Execute() { statement; }\
- };\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ALL_THREADS, &gtest_failures);\
- GTestExpectFatalFailureHelper::Execute();\
- }\
+#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ class GTestExpectFatalFailureHelper { \
+ public: \
+ static void Execute() { statement; } \
+ }; \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kFatalFailure, (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
+ &gtest_failures); \
+ GTestExpectFatalFailureHelper::Execute(); \
+ } \
} while (::testing::internal::AlwaysFalse())
// A macro for testing Google Test assertions or code that's expected to
-// generate Google Test non-fatal failures. It asserts that the given
-// statement will cause exactly one non-fatal Google Test failure with 'substr'
-// being part of the failure message.
+// generate Google Test non-fatal failures (e.g. a failure from an EXPECT_EQ,
+// but not from an ASSERT_EQ). It asserts that the given statement will cause
+// exactly one non-fatal Google Test failure with 'substr' being part of the
+// failure message.
//
// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only
// affects and considers failures generated in the current thread and
@@ -207,32 +212,37 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// instead of
// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
// to avoid an MSVC warning on unreachable code.
-#define EXPECT_NONFATAL_FAILURE(statement, substr) \
- do {\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \
+ do { \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
&gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
- (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
- ::testing::ScopedFakeTestPartResultReporter:: \
- INTERCEPT_ONLY_CURRENT_THREAD, &gtest_failures);\
- if (::testing::internal::AlwaysTrue()) { statement; }\
- }\
+ (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
+ ::testing::ScopedFakeTestPartResultReporter:: \
+ INTERCEPT_ONLY_CURRENT_THREAD, \
+ &gtest_failures); \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } \
+ } \
} while (::testing::internal::AlwaysFalse())
-#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
- do {\
- ::testing::TestPartResultArray gtest_failures;\
- ::testing::internal::SingleFailureChecker gtest_checker(\
- &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
- (substr));\
- {\
- ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\
+#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \
+ do { \
+ ::testing::TestPartResultArray gtest_failures; \
+ ::testing::internal::SingleFailureChecker gtest_checker( \
+ &gtest_failures, ::testing::TestPartResult::kNonFatalFailure, \
+ (substr)); \
+ { \
+ ::testing::ScopedFakeTestPartResultReporter gtest_reporter( \
::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS, \
- &gtest_failures);\
- if (::testing::internal::AlwaysTrue()) { statement; }\
- }\
+ &gtest_failures); \
+ if (::testing::internal::AlwaysTrue()) { \
+ statement; \
+ } \
+ } \
} while (::testing::internal::AlwaysFalse())
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
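// A usage sketch for the self-test macros reformatted above: each one
// expects exactly one failure of the given severity whose message
// contains the given substring.

#include "gtest/gtest-spi.h"
#include "gtest/gtest.h"

TEST(SpiSketch, CatchesExpectedFailures) {
  // Non-fatal: EXPECT_EQ fails but execution continues.
  EXPECT_NONFATAL_FAILURE(EXPECT_EQ(1, 2), "Expected equality");
  // Fatal: ASSERT_TRUE aborts the wrapped statement.
  EXPECT_FATAL_FAILURE(ASSERT_TRUE(false), "false");
}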
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
index 203fdf98c..09cc8c34f 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
@@ -26,14 +26,17 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
#include <iosfwd>
#include <vector>
+
#include "gtest/internal/gtest-internal.h"
#include "gtest/internal/gtest-string.h"
@@ -142,7 +145,8 @@ class GTEST_API_ TestPartResultArray {
private:
std::vector<TestPartResult> array_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray);
+ TestPartResultArray(const TestPartResultArray&) = delete;
+ TestPartResultArray& operator=(const TestPartResultArray&) = delete;
};
// This interface knows how to report a test part result.
@@ -168,11 +172,13 @@ class GTEST_API_ HasNewFatalFailureHelper
~HasNewFatalFailureHelper() override;
void ReportTestPartResult(const TestPartResult& result) override;
bool has_new_fatal_failure() const { return has_new_fatal_failure_; }
+
private:
bool has_new_fatal_failure_;
TestPartResultReporterInterface* original_reporter_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper);
+ HasNewFatalFailureHelper(const HasNewFatalFailureHelper&) = delete;
+ HasNewFatalFailureHelper& operator=(const HasNewFatalFailureHelper&) = delete;
};
} // namespace internal
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
index 9fdc6be10..bd35a3266 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
@@ -27,7 +27,9 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
@@ -190,7 +192,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
typedef ::testing::internal::GenerateTypeList<Types>::type \
GTEST_TYPE_PARAMS_(CaseName); \
typedef ::testing::internal::NameGeneratorSelector<__VA_ARGS__>::type \
- GTEST_NAME_GENERATOR_(CaseName)
+ GTEST_NAME_GENERATOR_(CaseName)
#define TYPED_TEST(CaseName, TestName) \
static_assert(sizeof(GTEST_STRINGIFY_(TestName)) > 1, \
@@ -256,7 +258,7 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
// #included in multiple translation units linked together.
#define TYPED_TEST_SUITE_P(SuiteName) \
static ::testing::internal::TypedTestSuitePState \
- GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
+ GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName)
// Legacy API is deprecated but still available
#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
@@ -301,21 +303,21 @@ INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);
REGISTER_TYPED_TEST_SUITE_P
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
-#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \
- static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \
- "test-suit-prefix must not be empty"); \
- static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \
- ::testing::internal::TypeParameterizedTestSuite< \
- SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \
- ::testing::internal::GenerateTypeList<Types>::type>:: \
- Register(GTEST_STRINGIFY_(Prefix), \
- ::testing::internal::CodeLocation(__FILE__, __LINE__), \
- &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \
- GTEST_STRINGIFY_(SuiteName), \
- GTEST_REGISTERED_TEST_NAMES_(SuiteName), \
- ::testing::internal::GenerateNames< \
- ::testing::internal::NameGeneratorSelector< \
- __VA_ARGS__>::type, \
+#define INSTANTIATE_TYPED_TEST_SUITE_P(Prefix, SuiteName, Types, ...) \
+ static_assert(sizeof(GTEST_STRINGIFY_(Prefix)) > 1, \
+ "test-suit-prefix must not be empty"); \
+ static bool gtest_##Prefix##_##SuiteName GTEST_ATTRIBUTE_UNUSED_ = \
+ ::testing::internal::TypeParameterizedTestSuite< \
+ SuiteName, GTEST_SUITE_NAMESPACE_(SuiteName)::gtest_AllTests_, \
+ ::testing::internal::GenerateTypeList<Types>::type>:: \
+ Register(GTEST_STRINGIFY_(Prefix), \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), \
+ &GTEST_TYPED_TEST_SUITE_P_STATE_(SuiteName), \
+ GTEST_STRINGIFY_(SuiteName), \
+ GTEST_REGISTERED_TEST_NAMES_(SuiteName), \
+ ::testing::internal::GenerateNames< \
+ ::testing::internal::NameGeneratorSelector< \
+ __VA_ARGS__>::type, \
::testing::internal::GenerateTypeList<Types>::type>())
// Legacy API is deprecated but still available
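// A sketch of the type-parameterized flow the macros above register,
// reusing the FooTest/MyTypes names from the comments in this file:

#include "gtest/gtest.h"

template <typename T>
class FooTest : public testing::Test {};

TYPED_TEST_SUITE_P(FooTest);

TYPED_TEST_P(FooTest, DefaultConstructs) {
  TypeParam value{};  // TypeParam is the current type in the list
  (void)value;
}

REGISTER_TYPED_TEST_SUITE_P(FooTest, DefaultConstructs);

using MyTypes = testing::Types<int, double>;
INSTANTIATE_TYPED_TEST_SUITE_P(My, FooTest, MyTypes);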
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest.h b/libvpx/third_party/googletest/src/include/gtest/gtest.h
index 7a5d057c4..d19a587a1 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest.h
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the public API for Google Test. It should be
@@ -47,8 +46,6 @@
// registration from Barthelemy Dagenais' (barthelemy@prologique.com)
// easyUnit framework.
-// GOOGLETEST_CM0001 DO NOT DELETE
-
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_
@@ -59,31 +56,22 @@
#include <type_traits>
#include <vector>
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-string.h"
+#include "gtest/gtest-assertion-result.h"
#include "gtest/gtest-death-test.h"
#include "gtest/gtest-matchers.h"
#include "gtest/gtest-message.h"
#include "gtest/gtest-param-test.h"
#include "gtest/gtest-printers.h"
-#include "gtest/gtest_prod.h"
#include "gtest/gtest-test-part.h"
#include "gtest/gtest-typed-test.h"
+#include "gtest/gtest_pred_impl.h"
+#include "gtest/gtest_prod.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-string.h"
GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
/* class A needs to have dll-interface to be used by clients of class B */)
-namespace testing {
-
-// Silence C4100 (unreferenced formal parameter) and 4805
-// unsafe mix of type 'const int' and type 'const bool'
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable:4805)
-# pragma warning(disable:4100)
-#endif
-
-
// Declares the flags.
// This flag temporary enables the disabled tests.
@@ -138,6 +126,12 @@ GTEST_DECLARE_int32_(random_seed);
// is 1. If the value is -1 the tests are repeating forever.
GTEST_DECLARE_int32_(repeat);
+// This flag controls whether Google Test Environments are recreated for each
+// repeat of the tests. The default value is true. If set to false the global
+// test Environment objects are only set up once, for the first iteration, and
+// only torn down once, for the last.
+GTEST_DECLARE_bool_(recreate_environments_when_repeating);
+
// This flag controls whether Google Test includes Google Test internal
// stack frames in failure stack traces.
GTEST_DECLARE_bool_(show_internal_stack_frames);
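// Sketch of how the new recreate_environments_when_repeating flag above
// interacts with global environments under --gtest_repeat (Env is an
// illustrative name; the flag maps to
// --gtest_recreate_environments_when_repeating on the command line):

#include "gtest/gtest.h"

class Env : public testing::Environment {
 public:
  void SetUp() override { /* expensive one-time setup */ }
  void TearDown() override { /* matching teardown */ }
};

int main(int argc, char** argv) {
  testing::AddGlobalTestEnvironment(new Env);  // gtest takes ownership
  testing::InitGoogleTest(&argc, argv);
  // With --gtest_repeat=N and the flag set to false, Env::SetUp runs once
  // before iteration 1 and Env::TearDown once after iteration N.
  return RUN_ALL_TESTS();
}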
@@ -163,6 +157,16 @@ GTEST_DECLARE_string_(stream_result_to);
GTEST_DECLARE_string_(flagfile);
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+namespace testing {
+
+// Silence C4100 (unreferenced formal parameter) and 4805
+// unsafe mix of type 'const int' and type 'const bool'
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4805)
+#pragma warning(disable : 4100)
+#endif
+
// The upper limit for valid stack trace depths.
const int kMaxStackTraceDepth = 100;
@@ -201,193 +205,6 @@ using TestCase = TestSuite;
class TestInfo;
class UnitTest;
-// A class for indicating whether an assertion was successful. When
-// the assertion wasn't successful, the AssertionResult object
-// remembers a non-empty message that describes how it failed.
-//
-// To create an instance of this class, use one of the factory functions
-// (AssertionSuccess() and AssertionFailure()).
-//
-// This class is useful for two purposes:
-// 1. Defining predicate functions to be used with Boolean test assertions
-// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts
-// 2. Defining predicate-format functions to be
-// used with predicate assertions (ASSERT_PRED_FORMAT*, etc).
-//
-// For example, if you define IsEven predicate:
-//
-// testing::AssertionResult IsEven(int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess();
-// else
-// return testing::AssertionFailure() << n << " is odd";
-// }
-//
-// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5)))
-// will print the message
-//
-// Value of: IsEven(Fib(5))
-// Actual: false (5 is odd)
-// Expected: true
-//
-// instead of a more opaque
-//
-// Value of: IsEven(Fib(5))
-// Actual: false
-// Expected: true
-//
-// in case IsEven is a simple Boolean predicate.
-//
-// If you expect your predicate to be reused and want to support informative
-// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up
-// about half as often as positive ones in our tests), supply messages for
-// both success and failure cases:
-//
-// testing::AssertionResult IsEven(int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess() << n << " is even";
-// else
-// return testing::AssertionFailure() << n << " is odd";
-// }
-//
-// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print
-//
-// Value of: IsEven(Fib(6))
-// Actual: true (8 is even)
-// Expected: false
-//
-// NB: Predicates that support negative Boolean assertions have reduced
-// performance in positive ones so be careful not to use them in tests
-// that have lots (tens of thousands) of positive Boolean assertions.
-//
-// To use this class with EXPECT_PRED_FORMAT assertions such as:
-//
-// // Verifies that Foo() returns an even number.
-// EXPECT_PRED_FORMAT1(IsEven, Foo());
-//
-// you need to define:
-//
-// testing::AssertionResult IsEven(const char* expr, int n) {
-// if ((n % 2) == 0)
-// return testing::AssertionSuccess();
-// else
-// return testing::AssertionFailure()
-// << "Expected: " << expr << " is even\n Actual: it's " << n;
-// }
-//
-// If Foo() returns 5, you will see the following message:
-//
-// Expected: Foo() is even
-// Actual: it's 5
-//
-class GTEST_API_ AssertionResult {
- public:
- // Copy constructor.
- // Used in EXPECT_TRUE/FALSE(assertion_result).
- AssertionResult(const AssertionResult& other);
-
-// C4800 is a level 3 warning in Visual Studio 2015 and earlier.
-// This warning is not emitted in Visual Studio 2017.
-// This warning is off by default starting in Visual Studio 2019 but can be
-// enabled with command-line options.
-#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4800 /* forcing value to bool */)
-#endif
-
- // Used in the EXPECT_TRUE/FALSE(bool_expression).
- //
- // T must be contextually convertible to bool.
- //
- // The second parameter prevents this overload from being considered if
- // the argument is implicitly convertible to AssertionResult. In that case
- // we want AssertionResult's copy constructor to be used.
- template <typename T>
- explicit AssertionResult(
- const T& success,
- typename std::enable_if<
- !std::is_convertible<T, AssertionResult>::value>::type*
- /*enabler*/
- = nullptr)
- : success_(success) {}
-
-#if defined(_MSC_VER) && (_MSC_VER < 1910 || _MSC_VER >= 1920)
- GTEST_DISABLE_MSC_WARNINGS_POP_()
-#endif
-
- // Assignment operator.
- AssertionResult& operator=(AssertionResult other) {
- swap(other);
- return *this;
- }
-
- // Returns true if and only if the assertion succeeded.
- operator bool() const { return success_; } // NOLINT
-
- // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
- AssertionResult operator!() const;
-
- // Returns the text streamed into this AssertionResult. Test assertions
- // use it when they fail (i.e., the predicate's outcome doesn't match the
- // assertion's expectation). When nothing has been streamed into the
- // object, returns an empty string.
- const char* message() const {
- return message_.get() != nullptr ? message_->c_str() : "";
- }
- // Deprecated; please use message() instead.
- const char* failure_message() const { return message(); }
-
- // Streams a custom failure message into this object.
- template <typename T> AssertionResult& operator<<(const T& value) {
- AppendMessage(Message() << value);
- return *this;
- }
-
- // Allows streaming basic output manipulators such as endl or flush into
- // this object.
- AssertionResult& operator<<(
- ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) {
- AppendMessage(Message() << basic_manipulator);
- return *this;
- }
-
- private:
- // Appends the contents of message to message_.
- void AppendMessage(const Message& a_message) {
- if (message_.get() == nullptr) message_.reset(new ::std::string);
- message_->append(a_message.GetString().c_str());
- }
-
- // Swap the contents of this AssertionResult with other.
- void swap(AssertionResult& other);
-
- // Stores result of the assertion predicate.
- bool success_;
- // Stores the message describing the condition in case the expectation
- // construct is not satisfied with the predicate's outcome.
- // Referenced via a pointer to avoid taking too much stack frame space
- // with test assertions.
- std::unique_ptr< ::std::string> message_;
-};
-
-// Makes a successful assertion result.
-GTEST_API_ AssertionResult AssertionSuccess();
-
-// Makes a failed assertion result.
-GTEST_API_ AssertionResult AssertionFailure();
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << msg.
-GTEST_API_ AssertionResult AssertionFailure(const Message& msg);
-
-} // namespace testing
-
-// Includes the auto-generated header that implements a family of generic
-// predicate assertion macros. This include comes late because it relies on
-// APIs declared above.
-#include "gtest/gtest_pred_impl.h"
-
-namespace testing {
-
// The abstract class that all tests inherit from.
//
// In Google Test, a unit test program contains one or many TestSuites, and
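// The AssertionResult block deleted above now lives in
// gtest-assertion-result.h, which the include-list hunk earlier in this
// file pulls in, so user-defined predicates keep compiling unchanged. A
// sketch, reusing the IsEven example from the deleted comment:

#include "gtest/gtest.h"

testing::AssertionResult IsEven(int n) {
  if ((n % 2) == 0) return testing::AssertionSuccess();
  return testing::AssertionFailure() << n << " is odd";
}

TEST(AssertionResultSketch, Predicate) { EXPECT_TRUE(IsEven(4)); }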
@@ -522,7 +339,8 @@ class GTEST_API_ Test {
virtual Setup_should_be_spelled_SetUp* Setup() { return nullptr; }
// We disallow copying Tests.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Test);
+ Test(const Test&) = delete;
+ Test& operator=(const Test&) = delete;
};
typedef internal::TimeInMillis TimeInMillis;
@@ -536,24 +354,17 @@ class TestProperty {
// C'tor. TestProperty does NOT have a default constructor.
// Always use this constructor (with parameters) to create a
// TestProperty object.
- TestProperty(const std::string& a_key, const std::string& a_value) :
- key_(a_key), value_(a_value) {
- }
+ TestProperty(const std::string& a_key, const std::string& a_value)
+ : key_(a_key), value_(a_value) {}
// Gets the user supplied key.
- const char* key() const {
- return key_.c_str();
- }
+ const char* key() const { return key_.c_str(); }
// Gets the user supplied value.
- const char* value() const {
- return value_.c_str();
- }
+ const char* value() const { return value_.c_str(); }
// Sets a new value, overriding the one supplied in the constructor.
- void SetValue(const std::string& new_value) {
- value_ = new_value;
- }
+ void SetValue(const std::string& new_value) { value_ = new_value; }
private:
// The key supplied by the user.
@@ -687,7 +498,8 @@ class GTEST_API_ TestResult {
TimeInMillis elapsed_time_;
// We disallow copying TestResult.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult);
+ TestResult(const TestResult&) = delete;
+ TestResult& operator=(const TestResult&) = delete;
}; // class TestResult
// A TestInfo object stores the following information about a test:
@@ -811,8 +623,8 @@ class GTEST_API_ TestInfo {
}
// These fields are immutable properties of the test.
- const std::string test_suite_name_; // test suite name
- const std::string name_; // Test name
+ const std::string test_suite_name_; // test suite name
+ const std::string name_; // Test name
// Name of the parameter type, or NULL if this is not a typed or a
// type-parameterized test.
const std::unique_ptr<const ::std::string> type_param_;
@@ -833,7 +645,8 @@ class GTEST_API_ TestInfo {
// test for the second time.
TestResult result_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo);
+ TestInfo(const TestInfo&) = delete;
+ TestInfo& operator=(const TestInfo&) = delete;
};
// A test suite, which consists of a vector of TestInfos.
@@ -941,7 +754,7 @@ class GTEST_API_ TestSuite {
// Adds a TestInfo to this test suite. Will delete the TestInfo upon
// destruction of the TestSuite object.
- void AddTestInfo(TestInfo * test_info);
+ void AddTestInfo(TestInfo* test_info);
// Clears the results of all tests in this test suite.
void ClearResult();
@@ -1042,7 +855,8 @@ class GTEST_API_ TestSuite {
TestResult ad_hoc_test_result_;
// We disallow copying TestSuites.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestSuite);
+ TestSuite(const TestSuite&) = delete;
+ TestSuite& operator=(const TestSuite&) = delete;
};
// An Environment object is capable of setting up and tearing down an
@@ -1069,6 +883,7 @@ class Environment {
// Override this to define how to tear down the environment.
virtual void TearDown() {}
+
private:
// If you see an error about overriding the following function or
// about it being private, you have mis-spelled SetUp() as Setup().
@@ -1120,6 +935,9 @@ class TestEventListener {
// Fired before the test starts.
virtual void OnTestStart(const TestInfo& test_info) = 0;
+ // Fired when a test is disabled
+ virtual void OnTestDisabled(const TestInfo& /*test_info*/) {}
+
// Fired after a failed assertion or a SUCCEED() invocation.
// If you want to throw an exception from this function to skip to the next
// TEST, it must be AssertionException defined above, or inherited from it.
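// Sketch of a listener using the new OnTestDisabled hook declared above;
// the default implementation is a no-op, so existing listeners compile
// unchanged (DisabledLogger is an illustrative name):

#include <cstdio>

#include "gtest/gtest.h"

class DisabledLogger : public testing::EmptyTestEventListener {
  void OnTestDisabled(const testing::TestInfo& info) override {
    std::printf("disabled: %s.%s\n", info.test_suite_name(), info.name());
  }
};

// Installed via:
//   testing::UnitTest::GetInstance()->listeners().Append(new DisabledLogger);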
@@ -1143,8 +961,7 @@ class TestEventListener {
virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0;
// Fired after each iteration of tests finishes.
- virtual void OnTestIterationEnd(const UnitTest& unit_test,
- int iteration) = 0;
+ virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration) = 0;
// Fired after all test activities have ended.
virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0;
@@ -1169,6 +986,7 @@ class EmptyTestEventListener : public TestEventListener {
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void OnTestStart(const TestInfo& /*test_info*/) override {}
+ void OnTestDisabled(const TestInfo& /*test_info*/) override {}
void OnTestPartResult(const TestPartResult& /*test_part_result*/) override {}
void OnTestEnd(const TestInfo& /*test_info*/) override {}
void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
@@ -1258,7 +1076,8 @@ class GTEST_API_ TestEventListeners {
TestEventListener* default_xml_generator_;
// We disallow copying TestEventListeners.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners);
+ TestEventListeners(const TestEventListeners&) = delete;
+ TestEventListeners& operator=(const TestEventListeners&) = delete;
};
// A UnitTest consists of a vector of TestSuites.
@@ -1301,8 +1120,7 @@ class GTEST_API_ UnitTest {
// Returns the TestInfo object for the test that's currently running,
// or NULL if no test is running.
- const TestInfo* current_test_info() const
- GTEST_LOCK_EXCLUDED_(mutex_);
+ const TestInfo* current_test_info() const GTEST_LOCK_EXCLUDED_(mutex_);
// Returns the random seed used at the start of the current test run.
int random_seed() const;
@@ -1408,8 +1226,7 @@ class GTEST_API_ UnitTest {
// eventually call this to report their results. The user code
// should use the assertion macros instead of calling this directly.
void AddTestPartResult(TestPartResult::Type result_type,
- const char* file_name,
- int line_number,
+ const char* file_name, int line_number,
const std::string& message,
const std::string& os_stack_trace)
GTEST_LOCK_EXCLUDED_(mutex_);
@@ -1440,8 +1257,7 @@ class GTEST_API_ UnitTest {
friend std::set<std::string>* internal::GetIgnoredParameterizedTestSuites();
friend internal::UnitTestImpl* internal::GetUnitTestImpl();
friend void internal::ReportFailureInUnknownLocation(
- TestPartResult::Type result_type,
- const std::string& message);
+ TestPartResult::Type result_type, const std::string& message);
// Creates an empty UnitTest.
UnitTest();
@@ -1455,8 +1271,7 @@ class GTEST_API_ UnitTest {
GTEST_LOCK_EXCLUDED_(mutex_);
// Pops a trace from the per-thread Google Test trace stack.
- void PopGTestTrace()
- GTEST_LOCK_EXCLUDED_(mutex_);
+ void PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_);
// Protects mutable state in *impl_. This is mutable as some const
// methods need to lock it too.
@@ -1469,7 +1284,8 @@ class GTEST_API_ UnitTest {
internal::UnitTestImpl* impl_;
// We disallow copying UnitTest.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest);
+ UnitTest(const UnitTest&) = delete;
+ UnitTest& operator=(const UnitTest&) = delete;
};
// A convenient wrapper for adding an environment for the test
@@ -1520,13 +1336,11 @@ namespace internal {
// when calling EXPECT_* in a tight loop.
template <typename T1, typename T2>
AssertionResult CmpHelperEQFailure(const char* lhs_expression,
- const char* rhs_expression,
- const T1& lhs, const T2& rhs) {
- return EqFailure(lhs_expression,
- rhs_expression,
+ const char* rhs_expression, const T1& lhs,
+ const T2& rhs) {
+ return EqFailure(lhs_expression, rhs_expression,
FormatForComparisonFailureMessage(lhs, rhs),
- FormatForComparisonFailureMessage(rhs, lhs),
- false);
+ FormatForComparisonFailureMessage(rhs, lhs), false);
}
// This block of code defines operator==/!=
@@ -1539,8 +1353,7 @@ inline bool operator!=(faketype, faketype) { return false; }
// The helper function for {ASSERT|EXPECT}_EQ.
template <typename T1, typename T2>
AssertionResult CmpHelperEQ(const char* lhs_expression,
- const char* rhs_expression,
- const T1& lhs,
+ const char* rhs_expression, const T1& lhs,
const T2& rhs) {
if (lhs == rhs) {
return AssertionSuccess();
@@ -1571,8 +1384,7 @@ class EqHelper {
// Even though its body looks the same as the above version, we
// cannot merge the two, as it will make anonymous enums unhappy.
static AssertionResult Compare(const char* lhs_expression,
- const char* rhs_expression,
- BiggestInt lhs,
+ const char* rhs_expression, BiggestInt lhs,
BiggestInt rhs) {
return CmpHelperEQ(lhs_expression, rhs_expression, lhs, rhs);
}
@@ -1607,16 +1419,16 @@ AssertionResult CmpHelperOpFailure(const char* expr1, const char* expr2,
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-template <typename T1, typename T2>\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
- const T1& val1, const T2& val2) {\
- if (val1 op val2) {\
- return AssertionSuccess();\
- } else {\
- return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
- }\
-}
+#define GTEST_IMPL_CMP_HELPER_(op_name, op) \
+ template <typename T1, typename T2> \
+ AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+ const T1& val1, const T2& val2) { \
+ if (val1 op val2) { \
+ return AssertionSuccess(); \
+ } else { \
+ return CmpHelperOpFailure(expr1, expr2, val1, val2, #op); \
+ } \
+ }
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
@@ -1638,49 +1450,42 @@ GTEST_IMPL_CMP_HELPER_(GT, >)
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRNE.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
+ const char* s1, const char* s2);
// The helper function for {ASSERT|EXPECT}_STRCASENE.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
const char* s2_expression,
- const char* s1,
- const char* s2);
-
+ const char* s1, const char* s2);
// Helper function for *_STREQ on wide strings.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTREQ(const char* s1_expression,
const char* s2_expression,
- const wchar_t* s1,
- const wchar_t* s2);
+ const wchar_t* s1, const wchar_t* s2);
// Helper function for *_STRNE on wide strings.
//
// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
const char* s2_expression,
- const wchar_t* s1,
- const wchar_t* s2);
+ const wchar_t* s1, const wchar_t* s2);
} // namespace internal
@@ -1692,32 +1497,40 @@ GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression,
//
// The {needle,haystack}_expr arguments are the stringified
// expressions that generated the two real arguments.
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack);
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const char* needle,
+ const char* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const wchar_t* needle,
+ const wchar_t* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const char* needle,
+ const char* haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const wchar_t* needle,
+ const wchar_t* haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack);
#if GTEST_HAS_STD_WSTRING
-GTEST_API_ AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack);
-GTEST_API_ AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack);
+GTEST_API_ AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack);
#endif // GTEST_HAS_STD_WSTRING
namespace internal {
@@ -1732,8 +1545,7 @@ namespace internal {
template <typename RawType>
AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
const char* rhs_expression,
- RawType lhs_value,
- RawType rhs_value) {
+ RawType lhs_value, RawType rhs_value) {
const FloatingPoint<RawType> lhs(lhs_value), rhs(rhs_value);
if (lhs.AlmostEquals(rhs)) {
@@ -1748,10 +1560,8 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
rhs_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
<< rhs_value;
- return EqFailure(lhs_expression,
- rhs_expression,
- StringStreamToString(&lhs_ss),
- StringStreamToString(&rhs_ss),
+ return EqFailure(lhs_expression, rhs_expression,
+ StringStreamToString(&lhs_ss), StringStreamToString(&rhs_ss),
false);
}
@@ -1761,8 +1571,7 @@ AssertionResult CmpHelperFloatingPointEQ(const char* lhs_expression,
GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
const char* expr2,
const char* abs_error_expr,
- double val1,
- double val2,
+ double val1, double val2,
double abs_error);
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -1770,9 +1579,7 @@ GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1,
class GTEST_API_ AssertHelper {
public:
// Constructor.
- AssertHelper(TestPartResult::Type type,
- const char* file,
- int line,
+ AssertHelper(TestPartResult::Type type, const char* file, int line,
const char* message);
~AssertHelper();
@@ -1786,11 +1593,9 @@ class GTEST_API_ AssertHelper {
// re-using stack space even for temporary variables, so every EXPECT_EQ
// reserves stack space for another AssertHelper.
struct AssertHelperData {
- AssertHelperData(TestPartResult::Type t,
- const char* srcfile,
- int line_num,
+ AssertHelperData(TestPartResult::Type t, const char* srcfile, int line_num,
const char* msg)
- : type(t), file(srcfile), line(line_num), message(msg) { }
+ : type(t), file(srcfile), line(line_num), message(msg) {}
TestPartResult::Type const type;
const char* const file;
@@ -1798,12 +1603,14 @@ class GTEST_API_ AssertHelper {
std::string const message;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData);
+ AssertHelperData(const AssertHelperData&) = delete;
+ AssertHelperData& operator=(const AssertHelperData&) = delete;
};
AssertHelperData* const data_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
+ AssertHelper(const AssertHelper&) = delete;
+ AssertHelper& operator=(const AssertHelper&) = delete;
};
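This hunk shows the mechanical change applied throughout the update: the GTEST_DISALLOW_COPY_AND_ASSIGN_ macro is replaced by explicitly deleted copy operations. A minimal sketch of the before/after pattern (the class name is illustrative):

```cpp
class NonCopyable {
 public:
  NonCopyable() = default;

 private:
  // Before: GTEST_DISALLOW_COPY_AND_ASSIGN_(NonCopyable);
  // After: the idiomatic C++11 spelling, with no helper macro required.
  NonCopyable(const NonCopyable&) = delete;
  NonCopyable& operator=(const NonCopyable&) = delete;
};
```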
} // namespace internal
@@ -1860,15 +1667,14 @@ class WithParamInterface {
private:
// Sets parameter value. The caller is responsible for making sure the value
// remains alive and unchanged throughout the current test.
- static void SetParam(const ParamType* parameter) {
- parameter_ = parameter;
- }
+ static void SetParam(const ParamType* parameter) { parameter_ = parameter; }
// Static value used for accessing parameter during a test lifetime.
static const ParamType* parameter_;
// TestClass must be a subclass of WithParamInterface<T> and Test.
- template <class TestClass> friend class internal::ParameterizedTestFactory;
+ template <class TestClass>
+ friend class internal::ParameterizedTestFactory;
};
template <typename T>
@@ -1878,8 +1684,7 @@ const T* WithParamInterface<T>::parameter_ = nullptr;
// WithParamInterface, and can just inherit from ::testing::TestWithParam.
template <typename T>
-class TestWithParam : public Test, public WithParamInterface<T> {
-};
+class TestWithParam : public Test, public WithParamInterface<T> {};
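TestWithParam<T> is the base class for value-parameterized tests; the fixture reads the current parameter through GetParam(), which WithParamInterface provides. A short sketch (suite name and values are illustrative):

```cpp
#include "gtest/gtest.h"

class EvenSketch : public ::testing::TestWithParam<int> {};

// TEST_P defines the body once; INSTANTIATE_TEST_SUITE_P runs it per value.
TEST_P(EvenSketch, IsEven) { EXPECT_EQ(GetParam() % 2, 0); }

INSTANTIATE_TEST_SUITE_P(SmallValues, EvenSketch,
                         ::testing::Values(0, 2, 4));
```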
// Macros for indicating success/failure in test code.
@@ -1910,7 +1715,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Generates a nonfatal failure at the given source file location with
// a generic message.
-#define ADD_FAILURE_AT(file, line) \
+#define ADD_FAILURE_AT(file, line) \
GTEST_MESSAGE_AT_(file, line, "Failed", \
::testing::TestPartResult::kNonFatalFailure)
@@ -1925,7 +1730,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Define this macro to 1 to omit the definition of FAIL(), which is a
// generic name and clashes with some other libraries.
#if !GTEST_DONT_DEFINE_FAIL
-# define FAIL() GTEST_FAIL()
+#define FAIL() GTEST_FAIL()
#endif
// Generates a success with a generic message.
@@ -1934,7 +1739,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Define this macro to 1 to omit the definition of SUCCEED(), which
// is a generic name and clashes with some other libraries.
#if !GTEST_DONT_DEFINE_SUCCEED
-# define SUCCEED() GTEST_SUCCEED()
+#define SUCCEED() GTEST_SUCCEED()
#endif
// Macros for testing exceptions.
@@ -1962,16 +1767,15 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// Boolean assertions. Condition can be either a Boolean expression or an
// AssertionResult. For more information on how to use AssertionResult with
// these macros see comments on that class.
-#define GTEST_EXPECT_TRUE(condition) \
+#define GTEST_EXPECT_TRUE(condition) \
GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
GTEST_NONFATAL_FAILURE_)
-#define GTEST_EXPECT_FALSE(condition) \
+#define GTEST_EXPECT_FALSE(condition) \
GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
GTEST_NONFATAL_FAILURE_)
#define GTEST_ASSERT_TRUE(condition) \
- GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
- GTEST_FATAL_FAILURE_)
-#define GTEST_ASSERT_FALSE(condition) \
+ GTEST_TEST_BOOLEAN_(condition, #condition, false, true, GTEST_FATAL_FAILURE_)
+#define GTEST_ASSERT_FALSE(condition) \
GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
GTEST_FATAL_FAILURE_)
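As the comment above notes, these Boolean macros also accept a testing::AssertionResult, which lets a helper explain why it failed. A sketch (the predicate is illustrative):

```cpp
#include "gtest/gtest.h"

::testing::AssertionResult IsEven(int n) {
  if (n % 2 == 0) return ::testing::AssertionSuccess();
  return ::testing::AssertionFailure() << n << " is odd";
}

// On failure, EXPECT_TRUE prints the streamed explanation, not just "false".
TEST(BooleanSketch, CarriesAnExplanation) { EXPECT_TRUE(IsEven(4)); }
```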
@@ -2070,27 +1874,27 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// ASSERT_XY(), which clashes with some users' own code.
#if !GTEST_DONT_DEFINE_ASSERT_EQ
-# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
+#define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_NE
-# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
+#define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_LE
-# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
+#define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_LT
-# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
+#define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_GE
-# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
+#define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2)
#endif
#if !GTEST_DONT_DEFINE_ASSERT_GT
-# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
+#define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2)
#endif
// C-string Comparisons. All tests treat NULL and any non-NULL string
@@ -2115,7 +1919,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
#define EXPECT_STRCASEEQ(s1, s2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define EXPECT_STRCASENE(s1, s2)\
+#define EXPECT_STRCASENE(s1, s2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
#define ASSERT_STREQ(s1, s2) \
@@ -2124,7 +1928,7 @@ class TestWithParam : public Test, public WithParamInterface<T> {
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2)
#define ASSERT_STRCASEEQ(s1, s2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, s1, s2)
-#define ASSERT_STRCASENE(s1, s2)\
+#define ASSERT_STRCASENE(s1, s2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2)
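These macros compare C-string contents; plain EXPECT_EQ on two char pointers would compare addresses instead. A sketch:

```cpp
#include "gtest/gtest.h"

TEST(CStringSketch, ComparesContentsNotPointers) {
  const char s1[] = "vp9";
  const char s2[] = "vp9";      // distinct array, identical contents
  EXPECT_STREQ(s1, s2);         // passes: contents match
  EXPECT_STRCASEEQ("VP9", s1);  // passes: case-insensitive match
  // EXPECT_EQ(s1, s2) would compare the two array addresses and fail.
}
```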
// Macros for comparing floating-point numbers.
@@ -2141,29 +1945,29 @@ class TestWithParam : public Test, public WithParamInterface<T> {
// FloatingPoint template class in gtest-internal.h if you are
// interested in the implementation details.
-#define EXPECT_FLOAT_EQ(val1, val2)\
+#define EXPECT_FLOAT_EQ(val1, val2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
val1, val2)
-#define EXPECT_DOUBLE_EQ(val1, val2)\
+#define EXPECT_DOUBLE_EQ(val1, val2) \
EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
val1, val2)
-#define ASSERT_FLOAT_EQ(val1, val2)\
+#define ASSERT_FLOAT_EQ(val1, val2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<float>, \
val1, val2)
-#define ASSERT_DOUBLE_EQ(val1, val2)\
+#define ASSERT_DOUBLE_EQ(val1, val2) \
ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ<double>, \
val1, val2)
-#define EXPECT_NEAR(val1, val2, abs_error)\
- EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
- val1, val2, abs_error)
+#define EXPECT_NEAR(val1, val2, abs_error) \
+ EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+ abs_error)
-#define ASSERT_NEAR(val1, val2, abs_error)\
- ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \
- val1, val2, abs_error)
+#define ASSERT_NEAR(val1, val2, abs_error) \
+ ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, val1, val2, \
+ abs_error)
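Two distinct tolerance models are defined above: the *_FLOAT_EQ/*_DOUBLE_EQ macros accept values within 4 units in the last place (via CmpHelperFloatingPointEQ), while *_NEAR takes an explicit absolute error (via DoubleNearPredFormat). A sketch:

```cpp
#include "gtest/gtest.h"

TEST(FloatingPointSketch, UlpsVersusAbsoluteError) {
  // 0.1 + 0.2 differs from 0.3 by one representable double, well within
  // the 4-ULP bound, so this passes.
  EXPECT_DOUBLE_EQ(0.1 + 0.2, 0.3);
  // Explicit absolute tolerance: |3.14159 - 3.1416| <= 1e-4.
  EXPECT_NEAR(3.14159, 3.1416, 1e-4);
}
```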
// These predicate format functions work on floating-point values, and
// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g.
@@ -2177,7 +1981,6 @@ GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2,
GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
double val1, double val2);
-
#if GTEST_OS_WINDOWS
// Macros that test for HRESULT failure and success, these are only useful
@@ -2189,17 +1992,17 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
// expected result and the actual result with both a human-readable
// string representation of the error, if available, as well as the
// hex result code.
-# define EXPECT_HRESULT_SUCCEEDED(expr) \
- EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define EXPECT_HRESULT_SUCCEEDED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
-# define ASSERT_HRESULT_SUCCEEDED(expr) \
- ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
+#define ASSERT_HRESULT_SUCCEEDED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr))
-# define EXPECT_HRESULT_FAILED(expr) \
- EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define EXPECT_HRESULT_FAILED(expr) \
+ EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
-# define ASSERT_HRESULT_FAILED(expr) \
- ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
+#define ASSERT_HRESULT_FAILED(expr) \
+ ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr))
#endif // GTEST_OS_WINDOWS
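A Windows-only sketch of the HRESULT macros above, assuming the usual S_OK and E_FAIL codes from the Windows SDK; any expression yielding an HRESULT can be used:

```cpp
#include "gtest/gtest.h"

#if GTEST_OS_WINDOWS
TEST(HresultSketch, SuccessAndFailure) {
  EXPECT_HRESULT_SUCCEEDED(S_OK);  // SUCCEEDED(S_OK) holds
  EXPECT_HRESULT_FAILED(E_FAIL);   // FAILED(E_FAIL) holds
}
#endif  // GTEST_OS_WINDOWS
```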
@@ -2214,9 +2017,9 @@ GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2,
// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed";
//
#define ASSERT_NO_FATAL_FAILURE(statement) \
- GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_)
#define EXPECT_NO_FATAL_FAILURE(statement) \
- GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
+ GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_)
// Causes a trace (including the given source file path and line number,
// and the given message) to be included in every test failure message generated
@@ -2258,7 +2061,8 @@ class GTEST_API_ ScopedTrace {
private:
void PushTrace(const char* file, int line, std::string message);
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace);
+ ScopedTrace(const ScopedTrace&) = delete;
+ ScopedTrace& operator=(const ScopedTrace&) = delete;
} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its
// c'tor and d'tor. Therefore it doesn't
// need to be used otherwise.
@@ -2278,9 +2082,9 @@ class GTEST_API_ ScopedTrace {
// Assuming that each thread maintains its own stack of traces.
// Therefore, a SCOPED_TRACE() would (correctly) only affect the
// assertions in its own thread.
-#define SCOPED_TRACE(message) \
- ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\
- __FILE__, __LINE__, (message))
+#define SCOPED_TRACE(message) \
+ ::testing::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)( \
+ __FILE__, __LINE__, (message))
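SCOPED_TRACE pushes a message onto a per-thread trace stack for the lifetime of the enclosing scope, which disambiguates failures coming from a shared helper. A sketch (helper and messages are illustrative):

```cpp
#include "gtest/gtest.h"

void CheckPositive(int n) { EXPECT_GT(n, 0); }

TEST(TraceSketch, TellsCallSitesApart) {
  {
    SCOPED_TRACE("first call");   // appears in failures from this block
    CheckPositive(1);
  }
  {
    SCOPED_TRACE("second call");  // and this one in failures from here
    CheckPositive(2);
  }
}
```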
// Compile-time assertion for type equality.
// StaticAssertTypeEq<type1, type2>() compiles if and only if type1 and type2
@@ -2378,20 +2182,19 @@ constexpr bool StaticAssertTypeEq() noexcept {
// EXPECT_EQ(a_.size(), 0);
// EXPECT_EQ(b_.size(), 1);
// }
-//
-// GOOGLETEST_CM0011 DO NOT DELETE
-#if !GTEST_DONT_DEFINE_TEST
-#define TEST_F(test_fixture, test_name)\
+#define GTEST_TEST_F(test_fixture, test_name) \
GTEST_TEST_(test_fixture, test_name, test_fixture, \
::testing::internal::GetTypeId<test_fixture>())
-#endif // !GTEST_DONT_DEFINE_TEST
+#if !GTEST_DONT_DEFINE_TEST_F
+#define TEST_F(test_fixture, test_name) GTEST_TEST_F(test_fixture, test_name)
+#endif
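The hunk above splits TEST_F into an always-available GTEST_TEST_F plus a short alias guarded by GTEST_DONT_DEFINE_TEST_F, mirroring the existing treatment of FAIL() and SUCCEED(). A fixture sketch:

```cpp
#include "gtest/gtest.h"

class FixtureSketch : public ::testing::Test {
 protected:
  void SetUp() override { value_ = 42; }  // runs before each TEST_F body
  int value_ = 0;
};

TEST_F(FixtureSketch, SeesSetUp) { EXPECT_EQ(value_, 42); }
```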
// Returns a path to temporary directory.
// Tries to determine an appropriate directory for the platform.
GTEST_API_ std::string TempDir();
#ifdef _MSC_VER
-# pragma warning(pop)
+#pragma warning(pop)
#endif
// Dynamically registers a test with the framework.
@@ -2445,6 +2248,7 @@ GTEST_API_ std::string TempDir();
// }
// ...
// int main(int argc, char** argv) {
+// ::testing::InitGoogleTest(&argc, argv);
// std::vector<int> values_to_test = LoadValuesFromConfig();
// RegisterMyTests(values_to_test);
// ...
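Fleshing out the comment above into a compilable sketch of RegisterTest-based dynamic registration (fixture, suite name, and values are illustrative); as in the comment, InitGoogleTest runs before any tests are registered:

```cpp
#include <string>

#include "gtest/gtest.h"

class MyFixture : public ::testing::Test {};

class MyTest : public MyFixture {
 public:
  explicit MyTest(int data) : data_(data) {}
  void TestBody() override { EXPECT_GE(data_, 0); }

 private:
  int data_;
};

int main(int argc, char** argv) {
  ::testing::InitGoogleTest(&argc, argv);
  for (int v : {1, 2, 3}) {
    ::testing::RegisterTest(
        "DynamicSuite", ("Value" + std::to_string(v)).c_str(), nullptr,
        std::to_string(v).c_str(), __FILE__, __LINE__,
        // The factory returns the fixture type; the dynamic type supplies
        // TestBody.
        [=]() -> MyFixture* { return new MyTest(v); });
  }
  return RUN_ALL_TESTS();
}
```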
@@ -2486,9 +2290,7 @@ TestInfo* RegisterTest(const char* test_suite_name, const char* test_name,
// namespace and has an all-caps name.
int RUN_ALL_TESTS() GTEST_MUST_USE_RESULT_;
-inline int RUN_ALL_TESTS() {
- return ::testing::UnitTest::GetInstance()->Run();
-}
+inline int RUN_ALL_TESTS() { return ::testing::UnitTest::GetInstance()->Run(); }
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
index 5029a9bb0..47a24aa68 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
@@ -26,17 +26,19 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-// This file is AUTOMATICALLY GENERATED on 01/02/2019 by command
-// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND!
//
// Implements a family of generic predicate assertion macros.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-#include "gtest/gtest.h"
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
namespace testing {
@@ -72,22 +74,18 @@ namespace testing {
// GTEST_ASSERT_ is the basic statement to which all of the assertions
// in this file reduce. Don't use this in your code.
-#define GTEST_ASSERT_(expression, on_failure) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+#define GTEST_ASSERT_(expression, on_failure) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
if (const ::testing::AssertionResult gtest_ar = (expression)) \
- ; \
- else \
+ ; \
+ else \
on_failure(gtest_ar.failure_message())
-
// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use
// this in your code.
-template <typename Pred,
- typename T1>
-AssertionResult AssertPred1Helper(const char* pred_text,
- const char* e1,
- Pred pred,
- const T1& v1) {
+template <typename Pred, typename T1>
+AssertionResult AssertPred1Helper(const char* pred_text, const char* e1,
+ Pred pred, const T1& v1) {
if (pred(v1)) return AssertionSuccess();
return AssertionFailure()
@@ -98,40 +96,27 @@ AssertionResult AssertPred1Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, v1), \
- on_failure)
+#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, v1), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use
// this in your code.
-#define GTEST_PRED1_(pred, v1, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \
- #v1, \
- pred, \
- v1), on_failure)
+#define GTEST_PRED1_(pred, v1, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, #v1, pred, v1), on_failure)
// Unary predicate assertion macros.
#define EXPECT_PRED_FORMAT1(pred_format, v1) \
GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_)
-#define EXPECT_PRED1(pred, v1) \
- GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
+#define EXPECT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_)
#define ASSERT_PRED_FORMAT1(pred_format, v1) \
GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_)
-#define ASSERT_PRED1(pred, v1) \
- GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
-
-
+#define ASSERT_PRED1(pred, v1) GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_)
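A sketch of the unary predicate macros above; on failure, AssertPred1Helper reports both the predicate and argument expressions (the predicate itself is illustrative):

```cpp
#include "gtest/gtest.h"

bool IsPowerOfTwo(unsigned n) { return n != 0 && (n & (n - 1)) == 0; }

TEST(PredSketch, Unary) { EXPECT_PRED1(IsPowerOfTwo, 64u); }
```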
// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2>
-AssertionResult AssertPred2Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- Pred pred,
- const T1& v1,
+template <typename Pred, typename T1, typename T2>
+AssertionResult AssertPred2Helper(const char* pred_text, const char* e1,
+ const char* e2, Pred pred, const T1& v1,
const T2& v2) {
if (pred(v1, v2)) return AssertionSuccess();
@@ -145,19 +130,14 @@ AssertionResult AssertPred2Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), \
- on_failure)
+#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use
// this in your code.
-#define GTEST_PRED2_(pred, v1, v2, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \
- #v1, \
- #v2, \
- pred, \
- v1, \
- v2), on_failure)
+#define GTEST_PRED2_(pred, v1, v2, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, #v1, #v2, pred, v1, v2), \
+ on_failure)
// Binary predicate assertion macros.
#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \
@@ -169,22 +149,12 @@ AssertionResult AssertPred2Helper(const char* pred_text,
#define ASSERT_PRED2(pred, v1, v2) \
GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_)
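For richer failure messages than EXPECT_PRED2 gives, a custom predicate-formatter can be passed to the *_PRED_FORMAT2 macros; its (expression-text, value) parameter pairing mirrors AssertPred2Helper. A sketch:

```cpp
#include "gtest/gtest.h"

::testing::AssertionResult DividesEvenly(const char* a_expr,
                                         const char* b_expr, int a, int b) {
  if (b != 0 && a % b == 0) return ::testing::AssertionSuccess();
  return ::testing::AssertionFailure() << a_expr << " (" << a
                                       << ") is not divisible by " << b_expr
                                       << " (" << b << ")";
}

TEST(PredSketch, BinaryFormatter) {
  EXPECT_PRED_FORMAT2(DividesEvenly, 12, 4);
}
```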
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3>
-AssertionResult AssertPred3Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3) {
+template <typename Pred, typename T1, typename T2, typename T3>
+AssertionResult AssertPred3Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3, Pred pred,
+ const T1& v1, const T2& v2, const T3& v3) {
if (pred(v1, v2, v3)) return AssertionSuccess();
return AssertionFailure()
@@ -198,21 +168,15 @@ AssertionResult AssertPred3Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), \
- on_failure)
+#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use
// this in your code.
-#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- pred, \
- v1, \
- v2, \
- v3), on_failure)
+#define GTEST_PRED3_(pred, v1, v2, v3, on_failure) \
+ GTEST_ASSERT_( \
+ ::testing::AssertPred3Helper(#pred, #v1, #v2, #v3, pred, v1, v2, v3), \
+ on_failure)
// Ternary predicate assertion macros.
#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \
@@ -224,25 +188,13 @@ AssertionResult AssertPred3Helper(const char* pred_text,
#define ASSERT_PRED3(pred, v1, v2, v3) \
GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_)
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3,
- typename T4>
-AssertionResult AssertPred4Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- const char* e4,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3,
- const T4& v4) {
+template <typename Pred, typename T1, typename T2, typename T3, typename T4>
+AssertionResult AssertPred4Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3,
+ const char* e4, Pred pred, const T1& v1,
+ const T2& v2, const T3& v3, const T4& v4) {
if (pred(v1, v2, v3, v4)) return AssertionSuccess();
return AssertionFailure()
@@ -257,23 +209,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\
- GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), \
- on_failure)
+#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure) \
+ GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4), on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use
// this in your code.
-#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- #v4, \
- pred, \
- v1, \
- v2, \
- v3, \
- v4), on_failure)
+#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, #v1, #v2, #v3, #v4, pred, \
+ v1, v2, v3, v4), \
+ on_failure)
// 4-ary predicate assertion macros.
#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \
@@ -285,28 +229,15 @@ AssertionResult AssertPred4Helper(const char* pred_text,
#define ASSERT_PRED4(pred, v1, v2, v3, v4) \
GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_)
-
-
// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use
// this in your code.
-template <typename Pred,
- typename T1,
- typename T2,
- typename T3,
- typename T4,
+template <typename Pred, typename T1, typename T2, typename T3, typename T4,
typename T5>
-AssertionResult AssertPred5Helper(const char* pred_text,
- const char* e1,
- const char* e2,
- const char* e3,
- const char* e4,
- const char* e5,
- Pred pred,
- const T1& v1,
- const T2& v2,
- const T3& v3,
- const T4& v4,
- const T5& v5) {
+AssertionResult AssertPred5Helper(const char* pred_text, const char* e1,
+ const char* e2, const char* e3,
+ const char* e4, const char* e5, Pred pred,
+ const T1& v1, const T2& v2, const T3& v3,
+ const T4& v4, const T5& v5) {
if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess();
return AssertionFailure()
@@ -322,25 +253,16 @@ AssertionResult AssertPred5Helper(const char* pred_text,
// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5.
// Don't use this in your code.
-#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\
+#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure) \
GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5), \
on_failure)
// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use
// this in your code.
-#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\
- GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \
- #v1, \
- #v2, \
- #v3, \
- #v4, \
- #v5, \
- pred, \
- v1, \
- v2, \
- v3, \
- v4, \
- v5), on_failure)
+#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure) \
+ GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, #v1, #v2, #v3, #v4, #v5, \
+ pred, v1, v2, v3, v4, v5), \
+ on_failure)
// 5-ary predicate assertion macros.
#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \
@@ -352,8 +274,6 @@ AssertionResult AssertPred5Helper(const char* pred_text,
#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \
GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_)
-
-
} // namespace testing
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
index 38b9d85a5..1f37dc31c 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
@@ -27,9 +27,8 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
-// Google C++ Testing and Mocking Framework definitions useful in production code.
-// GOOGLETEST_CM0003 DO NOT DELETE
+// Google C++ Testing and Mocking Framework definitions useful in production
+// code.
#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
@@ -55,7 +54,7 @@
// Note: The test class must be in the same namespace as the class being tested.
// For example, putting MyClassTest in an anonymous namespace will not work.
-#define FRIEND_TEST(test_case_name, test_name)\
-friend class test_case_name##_##test_name##_Test
+#define FRIEND_TEST(test_case_name, test_name) \
+ friend class test_case_name##_##test_name##_Test
#endif // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
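A sketch of FRIEND_TEST in use: the macro befriends the class gtest generates for the named test, so the test body may inspect private state. As the comment above warns, the test must live in the same namespace as the class under test (names here are illustrative):

```cpp
#include "gtest/gtest.h"
#include "gtest/gtest_prod.h"

class Counter {
 public:
  void Increment() { ++value_; }

 private:
  FRIEND_TEST(CounterTest, StartsAtZero);
  int value_ = 0;
};

TEST(CounterTest, StartsAtZero) {
  Counter c;
  EXPECT_EQ(c.value_, 0);  // legal: CounterTest_StartsAtZero_Test is a friend
}
```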
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md b/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md
index ff391fb4e..cb49e2c75 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/custom/README.md
@@ -15,18 +15,6 @@ The custom directory is an injection point for custom user configurations.
The following macros can be defined:
-### Flag related macros:
-
-* `GTEST_FLAG(flag_name)`
-* `GTEST_USE_OWN_FLAGFILE_FLAG_` - Define to 0 when the system provides its
- own flagfile flag parsing.
-* `GTEST_DECLARE_bool_(name)`
-* `GTEST_DECLARE_int32_(name)`
-* `GTEST_DECLARE_string_(name)`
-* `GTEST_DEFINE_bool_(name, default_val, doc)`
-* `GTEST_DEFINE_int32_(name, default_val, doc)`
-* `GTEST_DEFINE_string_(name, default_val, doc)`
-
### Logging:
* `GTEST_LOG_(severity)`
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
index db02881c0..9b7fb4261 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
@@ -34,4 +34,35 @@
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+// Use a stub Notification class.
+//
+// The built-in Notification class in GoogleTest v1.12.1 uses std::mutex and
+// std::condition_variable. The <mutex> and <condition_variable> headers of
+// mingw32 g++ (GNU 10.0.0) define std::mutex and std::condition_variable only
+// when configured with the posix threads option but don't define them when
+// configured with the win32 threads option. The Notification class is only
+// used in GoogleTest's internal tests. Since we don't build GoogleTest's
+// internal tests, we don't need a working Notification class. Although it's
+// not hard to fix the mingw32 g++ compilation errors by implementing the
+// Notification class using Windows CRITICAL_SECTION and CONDITION_VARIABLE,
+// it's simpler to just use a stub Notification class on all platforms.
+//
+// The default constructor of the stub class is deleted and the declaration of
+// the Notify() method is commented out, so that compilation will fail if any
+// code actually uses the Notification class.
+
+#define GTEST_HAS_NOTIFICATION_ 1
+namespace testing {
+namespace internal {
+class Notification {
+ public:
+ Notification() = delete;
+ Notification(const Notification&) = delete;
+ Notification& operator=(const Notification&) = delete;
+ // void Notify();
+ void WaitForNotification() {}
+};
+} // namespace internal
+} // namespace testing
+
#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
index 490296dfa..45580ae80 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
@@ -26,27 +26,31 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines internal utilities needed for implementing
// death tests. They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
+
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#include <stdio.h>
+
+#include <memory>
+
#include "gtest/gtest-matchers.h"
#include "gtest/internal/gtest-internal.h"
-#include <stdio.h>
-#include <memory>
+GTEST_DECLARE_string_(internal_run_death_test);
namespace testing {
namespace internal {
-GTEST_DECLARE_string_(internal_run_death_test);
-
// Names of the flags (needed for parsing Google Test flags).
const char kDeathTestStyleFlag[] = "death_test_style";
const char kDeathTestUseFork[] = "death_test_use_fork";
@@ -83,16 +87,18 @@ class GTEST_API_ DeathTest {
static bool Create(const char* statement, Matcher<const std::string&> matcher,
const char* file, int line, DeathTest** test);
DeathTest();
- virtual ~DeathTest() { }
+ virtual ~DeathTest() {}
// A helper class that aborts a death test when it's deleted.
class ReturnSentinel {
public:
- explicit ReturnSentinel(DeathTest* test) : test_(test) { }
+ explicit ReturnSentinel(DeathTest* test) : test_(test) {}
~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); }
+
private:
DeathTest* const test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel);
+ ReturnSentinel(const ReturnSentinel&) = delete;
+ ReturnSentinel& operator=(const ReturnSentinel&) = delete;
} GTEST_ATTRIBUTE_UNUSED_;
// An enumeration of possible roles that may be taken when a death
@@ -137,7 +143,8 @@ class GTEST_API_ DeathTest {
// A string containing a description of the outcome of the last death test.
static std::string last_death_test_message_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest);
+ DeathTest(const DeathTest&) = delete;
+ DeathTest& operator=(const DeathTest&) = delete;
};
GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
@@ -145,7 +152,7 @@ GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
// Factory interface for death tests. May be mocked out for testing.
class DeathTestFactory {
public:
- virtual ~DeathTestFactory() { }
+ virtual ~DeathTestFactory() {}
virtual bool Create(const char* statement,
Matcher<const std::string&> matcher, const char* file,
int line, DeathTest** test) = 0;
@@ -186,28 +193,28 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
// Traps C++ exceptions escaping statement and reports them as test
// failures. Note that trapping SEH exceptions is not implemented here.
-# if GTEST_HAS_EXCEPTIONS
-# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } catch (const ::std::exception& gtest_exception) { \
- fprintf(\
- stderr, \
- "\n%s: Caught std::exception-derived exception escaping the " \
- "death test statement. Exception message: %s\n", \
+#if GTEST_HAS_EXCEPTIONS
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (const ::std::exception& gtest_exception) { \
+ fprintf( \
+ stderr, \
+ "\n%s: Caught std::exception-derived exception escaping the " \
+ "death test statement. Exception message: %s\n", \
::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \
- gtest_exception.what()); \
- fflush(stderr); \
+ gtest_exception.what()); \
+ fflush(stderr); \
death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
- } catch (...) { \
+ } catch (...) { \
death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \
}
-# else
-# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
+#else
+#define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \
GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement)
-# endif
+#endif
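These internals back the public death-test macros, where the child process's stderr is matched against the given matcher or regex. A usage sketch (function and message are illustrative):

```cpp
#include <cstdio>
#include <cstdlib>

#include "gtest/gtest.h"

void FatalError() {
  std::fprintf(stderr, "out of memory\n");
  std::abort();
}

TEST(DeathSketch, AbortsWithMessage) {
  // Runs FatalError() in a child process and checks both that it dies and
  // that its stderr matches the pattern.
  EXPECT_DEATH(FatalError(), "out of memory");
}
```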
// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*,
// ASSERT_EXIT*, and EXPECT_EXIT*.
@@ -236,8 +243,6 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \
break; \
} \
- default: \
- break; \
} \
} \
} else \
@@ -265,16 +270,12 @@ inline Matcher<const ::std::string&> MakeDeathTestMatcher(
// RUN_ALL_TESTS was called.
class InternalRunDeathTestFlag {
public:
- InternalRunDeathTestFlag(const std::string& a_file,
- int a_line,
- int an_index,
+ InternalRunDeathTestFlag(const std::string& a_file, int a_line, int an_index,
int a_write_fd)
- : file_(a_file), line_(a_line), index_(an_index),
- write_fd_(a_write_fd) {}
+ : file_(a_file), line_(a_line), index_(an_index), write_fd_(a_write_fd) {}
~InternalRunDeathTestFlag() {
- if (write_fd_ >= 0)
- posix::Close(write_fd_);
+ if (write_fd_ >= 0) posix::Close(write_fd_);
}
const std::string& file() const { return file_; }
@@ -288,7 +289,8 @@ class InternalRunDeathTestFlag {
int index_;
int write_fd_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag);
+ InternalRunDeathTestFlag(const InternalRunDeathTestFlag&) = delete;
+ InternalRunDeathTestFlag& operator=(const InternalRunDeathTestFlag&) = delete;
};
// Returns a newly created InternalRunDeathTestFlag object with fields
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
index 0c033abc3..a2a60a962 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Google Test filepath utilities
//
// This header file declares classes and functions used internally by
@@ -35,7 +35,9 @@
// This file is #included in gtest/internal/gtest-internal.h.
// Do not include this header file separately!
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
@@ -61,8 +63,8 @@ namespace internal {
class GTEST_API_ FilePath {
public:
- FilePath() : pathname_("") { }
- FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { }
+ FilePath() : pathname_("") {}
+ FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) {}
explicit FilePath(const std::string& pathname) : pathname_(pathname) {
Normalize();
@@ -73,9 +75,7 @@ class GTEST_API_ FilePath {
return *this;
}
- void Set(const FilePath& rhs) {
- pathname_ = rhs.pathname_;
- }
+ void Set(const FilePath& rhs) { pathname_ = rhs.pathname_; }
const std::string& string() const { return pathname_; }
const char* c_str() const { return pathname_.c_str(); }
@@ -88,8 +88,7 @@ class GTEST_API_ FilePath {
// than zero (e.g., 12), returns "dir/test_12.xml".
// On Windows platform, uses \ as the separator rather than /.
static FilePath MakeFileName(const FilePath& directory,
- const FilePath& base_name,
- int number,
+ const FilePath& base_name, int number,
const char* extension);
// Given directory = "dir", relative_path = "test.xml",
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
index f8cbdbd81..9b04e4c85 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
@@ -26,13 +26,15 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file declares functions and macros used internally by
// Google Test. They are subject to change without notice.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
@@ -40,19 +42,20 @@
#include "gtest/internal/gtest-port.h"
#if GTEST_OS_LINUX
-# include <stdlib.h>
-# include <sys/types.h>
-# include <sys/wait.h>
-# include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <unistd.h>
#endif // GTEST_OS_LINUX
#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
#endif
#include <ctype.h>
#include <float.h>
#include <string.h>
+
#include <cstdint>
#include <iomanip>
#include <limits>
@@ -76,7 +79,7 @@
// the current line number. For more details, see
// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6
#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar)
-#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar
+#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo##bar
// Stringifies its argument.
// Work around a bug in visual studio which doesn't accept code like this:
@@ -98,21 +101,21 @@ namespace testing {
// Forward declarations.
-class AssertionResult; // Result of an assertion.
-class Message; // Represents a failure message.
-class Test; // Represents a test.
-class TestInfo; // Information about a test.
-class TestPartResult; // Result of a test part.
-class UnitTest; // A collection of test suites.
+class AssertionResult; // Result of an assertion.
+class Message; // Represents a failure message.
+class Test; // Represents a test.
+class TestInfo; // Information about a test.
+class TestPartResult; // Result of a test part.
+class UnitTest; // A collection of test suites.
template <typename T>
::std::string PrintToString(const T& value);
namespace internal {
-struct TraceInfo; // Information about a trace point.
-class TestInfoImpl; // Opaque implementation of TestInfo
-class UnitTestImpl; // Opaque implementation of UnitTest
+struct TraceInfo; // Information about a trace point.
+class TestInfoImpl; // Opaque implementation of TestInfo
+class UnitTestImpl; // Opaque implementation of UnitTest
// The text used in failure messages to indicate the start of the
// stack trace.
@@ -121,6 +124,7 @@ GTEST_API_ extern const char kStackTraceMarker[];
// An IgnoredValue object can be implicitly constructed from ANY value.
class IgnoredValue {
struct Sink {};
+
public:
// This constructor template allows any value to be implicitly
// converted to IgnoredValue. The object has no data member and
@@ -136,13 +140,13 @@ class IgnoredValue {
};
// Appends the user-supplied message to the Google-Test-generated message.
-GTEST_API_ std::string AppendUserMessage(
- const std::string& gtest_msg, const Message& user_msg);
+GTEST_API_ std::string AppendUserMessage(const std::string& gtest_msg,
+ const Message& user_msg);
#if GTEST_HAS_EXCEPTIONS
-GTEST_DISABLE_MSC_WARNINGS_PUSH_(4275 \
-/* an exported class was derived from a class that was not exported */)
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(
+ 4275 /* an exported class was derived from a class that was not exported */)
// This exception is thrown by (and only by) a failed Google Test
// assertion when GTEST_FLAG(throw_on_failure) is true (if exceptions
@@ -181,14 +185,6 @@ GTEST_API_ std::string CreateUnifiedDiff(const std::vector<std::string>& left,
} // namespace edit_distance
-// Calculate the diff between 'left' and 'right' and return it in unified diff
-// format.
-// If not null, stores in 'total_line_count' the total number of lines found
-// in left + right.
-GTEST_API_ std::string DiffStrings(const std::string& left,
- const std::string& right,
- size_t* total_line_count);
-
// Constructs and returns the message for an equality assertion
// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure.
//
@@ -212,10 +208,8 @@ GTEST_API_ AssertionResult EqFailure(const char* expected_expression,
// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
GTEST_API_ std::string GetBoolAssertionFailureMessage(
- const AssertionResult& assertion_result,
- const char* expression_text,
- const char* actual_predicate_value,
- const char* expected_predicate_value);
+ const AssertionResult& assertion_result, const char* expression_text,
+ const char* actual_predicate_value, const char* expected_predicate_value);
// This template class represents an IEEE floating-point number
// (either single-precision or double-precision, depending on the
@@ -256,11 +250,11 @@ class FloatingPoint {
// Constants.
// # of bits in a number.
- static const size_t kBitCount = 8*sizeof(RawType);
+ static const size_t kBitCount = 8 * sizeof(RawType);
// # of fraction bits in a number.
static const size_t kFractionBitCount =
- std::numeric_limits<RawType>::digits - 1;
+ std::numeric_limits<RawType>::digits - 1;
// # of exponent bits in a number.
static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount;
@@ -269,8 +263,8 @@ class FloatingPoint {
static const Bits kSignBitMask = static_cast<Bits>(1) << (kBitCount - 1);
// The mask for the fraction bits.
- static const Bits kFractionBitMask =
- ~static_cast<Bits>(0) >> (kExponentBitCount + 1);
+ static const Bits kFractionBitMask = ~static_cast<Bits>(0) >>
+ (kExponentBitCount + 1);
// The mask for the exponent bits.
static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask);
@@ -309,9 +303,7 @@ class FloatingPoint {
}
// Returns the floating-point number that represent positive infinity.
- static RawType Infinity() {
- return ReinterpretBits(kExponentBitMask);
- }
+ static RawType Infinity() { return ReinterpretBits(kExponentBitMask); }
// Returns the maximum representable finite floating-point number.
static RawType Max();
@@ -319,7 +311,7 @@ class FloatingPoint {
// Non-static methods
// Returns the bits that represents this number.
- const Bits &bits() const { return u_.bits_; }
+ const Bits& bits() const { return u_.bits_; }
// Returns the exponent bits of this number.
Bits exponent_bits() const { return kExponentBitMask & u_.bits_; }
@@ -348,8 +340,8 @@ class FloatingPoint {
// a NAN must return false.
if (is_nan() || rhs.is_nan()) return false;
- return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_)
- <= kMaxUlps;
+ return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) <=
+ kMaxUlps;
}
private:
@@ -374,7 +366,7 @@ class FloatingPoint {
//
// Read http://en.wikipedia.org/wiki/Signed_number_representations
// for more details on signed number representations.
- static Bits SignAndMagnitudeToBiased(const Bits &sam) {
+ static Bits SignAndMagnitudeToBiased(const Bits& sam) {
if (kSignBitMask & sam) {
// sam represents a negative number.
return ~sam + 1;
@@ -386,8 +378,8 @@ class FloatingPoint {
// Given two numbers in the sign-and-magnitude representation,
// returns the distance between them as an unsigned number.
- static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1,
- const Bits &sam2) {
+ static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits& sam1,
+ const Bits& sam2) {
const Bits biased1 = SignAndMagnitudeToBiased(sam1);
const Bits biased2 = SignAndMagnitudeToBiased(sam2);
return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1);
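A standalone sketch of the biased-representation trick above, specialized to float under the assumption of IEEE-754 binary32: map sign-and-magnitude bit patterns onto a monotonic unsigned scale, and the unsigned difference is the distance in ULPs that AlmostEquals() compares against kMaxUlps (4):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint32_t Biased(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));  // type-pun without aliasing UB
  const uint32_t kSignMask = UINT32_C(1) << 31;
  // Negative values map below the sign bit, non-negative ones above it, so
  // the result is monotonic in the real value (NaNs excluded).
  return (bits & kSignMask) ? ~bits + 1 : bits | kSignMask;
}

static uint32_t UlpDistance(float a, float b) {
  const uint32_t x = Biased(a), y = Biased(b);
  return x >= y ? x - y : y - x;
}

int main() {
  const float next = std::nextafter(1.0f, 2.0f);  // one step above 1.0f
  std::printf("%u\n", UlpDistance(1.0f, next));   // prints 1
  return 0;
}
```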
@@ -399,9 +391,13 @@ class FloatingPoint {
// We cannot use std::numeric_limits<T>::max() as it clashes with the max()
// macro defined by <windows.h>.
template <>
-inline float FloatingPoint<float>::Max() { return FLT_MAX; }
+inline float FloatingPoint<float>::Max() {
+ return FLT_MAX;
+}
template <>
-inline double FloatingPoint<double>::Max() { return DBL_MAX; }
+inline double FloatingPoint<double>::Max() {
+ return DBL_MAX;
+}
// Typedefs the instances of the FloatingPoint template class that we
// care to use.
@@ -461,7 +457,8 @@ class TestFactoryBase {
TestFactoryBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase);
+ TestFactoryBase(const TestFactoryBase&) = delete;
+ TestFactoryBase& operator=(const TestFactoryBase&) = delete;
};
// This class provides an implementation of the TestFactoryBase interface.
@@ -510,11 +507,11 @@ inline SetUpTearDownSuiteFuncType GetNotDefaultOrNull(
template <typename T>
// Note that SuiteApiResolver inherits from T because
-// SetUpTestSuite()/TearDownTestSuite() could be protected. Ths way
+// SetUpTestSuite()/TearDownTestSuite() could be protected. This way
// SuiteApiResolver can access them.
struct SuiteApiResolver : T {
// testing::Test is only forward declared at this point. So we make it a
- // dependend class for the compiler to be OK with it.
+ // dependent class for the compiler to be OK with it.
using Test =
typename std::conditional<sizeof(T) != 0, ::testing::Test, void>::type;
@@ -654,7 +651,8 @@ inline const char* SkipComma(const char* str) {
if (comma == nullptr) {
return nullptr;
}
- while (IsSpace(*(++comma))) {}
+ while (IsSpace(*(++comma))) {
+ }
return comma;
}
@@ -668,7 +666,7 @@ inline std::string GetPrefixUntilComma(const char* str) {
// Splits a given string on a given delimiter, populating a given
// vector with the fields.
void SplitString(const ::std::string& str, char delimiter,
- ::std::vector< ::std::string>* dest);
+ ::std::vector<::std::string>* dest);
// The default argument to the template below for the case when the user does
// not provide a name generator.
@@ -781,13 +779,13 @@ class TypeParameterizedTestSuite {
const std::vector<std::string>& type_names =
GenerateNames<DefaultNameGenerator, Types>()) {
RegisterTypeParameterizedTestSuiteInstantiation(case_name);
- std::string test_name = StripTrailingSpaces(
- GetPrefixUntilComma(test_names));
+ std::string test_name =
+ StripTrailingSpaces(GetPrefixUntilComma(test_names));
if (!state->TestExists(test_name)) {
fprintf(stderr, "Failed to get code location for test %s.%s at %s.",
case_name, test_name.c_str(),
- FormatFileLocation(code_location.file.c_str(),
- code_location.line).c_str());
+ FormatFileLocation(code_location.file.c_str(), code_location.line)
+ .c_str());
fflush(stderr);
posix::Abort();
}
@@ -831,8 +829,8 @@ class TypeParameterizedTestSuite<Fixture, internal::None, Types> {
// For example, if Foo() calls Bar(), which in turn calls
// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(
- UnitTest* unit_test, int skip_count);
+GTEST_API_ std::string GetCurrentOsStackTraceExceptTop(UnitTest* unit_test,
+ int skip_count);
// Helpers for suppressing warnings on unreachable code or constant
// condition.
@@ -881,7 +879,8 @@ class GTEST_API_ Random {
private:
uint32_t state_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Random);
+ Random(const Random&) = delete;
+ Random& operator=(const Random&) = delete;
};
// Turns const U&, U&, const U, and U all into U.
@@ -954,7 +953,9 @@ IsContainer IsContainerTest(int /* dummy */) {
typedef char IsNotContainer;
template <class C>
-IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; }
+IsNotContainer IsContainerTest(long /* dummy */) {
+ return '\0';
+}
// Trait to detect whether a type T is a hash table.
// The heuristic used is that the type contains an inner type `hasher` and does
@@ -1017,11 +1018,13 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs);
// This generic version is used when k is 0.
template <typename T, typename U>
-inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; }
+inline bool ArrayEq(const T& lhs, const U& rhs) {
+ return lhs == rhs;
+}
// This overload is used when k >= 1.
template <typename T, typename U, size_t N>
-inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
+inline bool ArrayEq(const T (&lhs)[N], const U (&rhs)[N]) {
return internal::ArrayEq(lhs, N, rhs);
}
@@ -1031,8 +1034,7 @@ inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) {
template <typename T, typename U>
bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
for (size_t i = 0; i != size; i++) {
- if (!internal::ArrayEq(lhs[i], rhs[i]))
- return false;
+ if (!internal::ArrayEq(lhs[i], rhs[i])) return false;
}
return true;
}
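How the ArrayEq overloads above compose: the reference-to-array overload peels one dimension per call, so nested C arrays compare elementwise all the way down. A sketch (ArrayEq is an internal helper, exercised here only to make the recursion concrete):

```cpp
#include "gtest/gtest.h"

TEST(ArrayEqSketch, RecursesIntoNestedArrays) {
  int a[2][3] = {{1, 2, 3}, {4, 5, 6}};
  int b[2][3] = {{1, 2, 3}, {4, 5, 6}};
  // Each int[3] row re-enters the array-reference overload, then the
  // element overload compares scalars with ==.
  EXPECT_TRUE(::testing::internal::ArrayEq(a, b));
}
```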
@@ -1042,8 +1044,7 @@ bool ArrayEq(const T* lhs, size_t size, const U* rhs) {
template <typename Iter, typename Element>
Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) {
for (Iter it = begin; it != end; ++it) {
- if (internal::ArrayEq(*it, elem))
- return it;
+ if (internal::ArrayEq(*it, elem)) return it;
}
return end;
}
@@ -1057,11 +1058,13 @@ void CopyArray(const T* from, size_t size, U* to);
// This generic version is used when k is 0.
template <typename T, typename U>
-inline void CopyArray(const T& from, U* to) { *to = from; }
+inline void CopyArray(const T& from, U* to) {
+ *to = from;
+}
// This overload is used when k >= 1.
template <typename T, typename U, size_t N>
-inline void CopyArray(const T(&from)[N], U(*to)[N]) {
+inline void CopyArray(const T (&from)[N], U (*to)[N]) {
internal::CopyArray(from, N, *to);
}
@@ -1114,8 +1117,7 @@ class NativeArray {
}
~NativeArray() {
- if (clone_ != &NativeArray::InitRef)
- delete[] array_;
+ if (clone_ != &NativeArray::InitRef) delete[] array_;
}
// STL-style container methods.
@@ -1123,8 +1125,7 @@ class NativeArray {
const_iterator begin() const { return array_; }
const_iterator end() const { return array_ + size_; }
bool operator==(const NativeArray& rhs) const {
- return size() == rhs.size() &&
- ArrayEq(begin(), size(), rhs.begin());
+ return size() == rhs.size() && ArrayEq(begin(), size(), rhs.begin());
}
private:
@@ -1335,9 +1336,9 @@ struct tuple_size<testing::internal::FlatTuple<Ts...>>
#endif
} // namespace std
-#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
- ::testing::internal::AssertHelper(result_type, file, line, message) \
- = ::testing::Message()
+#define GTEST_MESSAGE_AT_(file, line, message, result_type) \
+ ::testing::internal::AssertHelper(result_type, file, line, message) = \
+ ::testing::Message()
#define GTEST_MESSAGE_(message, result_type) \
GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type)
@@ -1458,103 +1459,112 @@ class NeverThrown {
#endif // GTEST_HAS_EXCEPTIONS
-#define GTEST_TEST_NO_THROW_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::TrueWithString gtest_msg{}) { \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } \
- GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
- catch (...) { \
- gtest_msg.value = "it throws."; \
- goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \
- fail(("Expected: " #statement " doesn't throw an exception.\n" \
- " Actual: " + gtest_msg.value).c_str())
-
-#define GTEST_TEST_ANY_THROW_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
- bool gtest_caught_any = false; \
- try { \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- } \
- catch (...) { \
- gtest_caught_any = true; \
- } \
- if (!gtest_caught_any) { \
+#define GTEST_TEST_NO_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::TrueWithString gtest_msg{}) { \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } \
+ GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
+ catch (...) { \
+ gtest_msg.value = "it throws."; \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__) \
+ : fail(("Expected: " #statement " doesn't throw an exception.\n" \
+ " Actual: " + \
+ gtest_msg.value) \
+ .c_str())
+
+#define GTEST_TEST_ANY_THROW_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
+ bool gtest_caught_any = false; \
+ try { \
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ } catch (...) { \
+ gtest_caught_any = true; \
+ } \
+ if (!gtest_caught_any) { \
goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \
- fail("Expected: " #statement " throws an exception.\n" \
- " Actual: it doesn't.")
-
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__) \
+ : fail("Expected: " #statement \
+ " throws an exception.\n" \
+ " Actual: it doesn't.")
// Implements Boolean test assertions such as EXPECT_TRUE. expression can be
// either a boolean expression or an AssertionResult. text is a textual
// representation of expression as it was passed into the EXPECT_TRUE.
#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (const ::testing::AssertionResult gtest_ar_ = \
- ::testing::AssertionResult(expression)) \
- ; \
- else \
- fail(::testing::internal::GetBoolAssertionFailureMessage(\
- gtest_ar_, text, #actual, #expected).c_str())
-
-#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::AlwaysTrue()) { \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (const ::testing::AssertionResult gtest_ar_ = \
+ ::testing::AssertionResult(expression)) \
+ ; \
+ else \
+ fail(::testing::internal::GetBoolAssertionFailureMessage( \
+ gtest_ar_, text, #actual, #expected) \
+ .c_str())
+
+#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::AlwaysTrue()) { \
::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \
- GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
- if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
- goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
- } \
- } else \
- GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \
- fail("Expected: " #statement " doesn't generate new fatal " \
- "failures in the current thread.\n" \
- " Actual: it does.")
+ GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
+ if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \
+ goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \
+ } \
+ } else \
+ GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__) \
+ : fail("Expected: " #statement \
+ " doesn't generate new fatal " \
+ "failures in the current thread.\n" \
+ " Actual: it does.")
// Expands to the name of the class that implements the given test.
#define GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
test_suite_name##_##test_name##_Test
// Helper macro for defining tests.
-#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \
- static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \
- "test_suite_name must not be empty"); \
- static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \
- "test_name must not be empty"); \
- class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
- : public parent_class { \
- public: \
- GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \
- ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
- GTEST_DISALLOW_MOVE_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)); \
- \
- private: \
- void TestBody() override; \
- static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \
- }; \
- \
- ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \
- test_name)::test_info_ = \
- ::testing::internal::MakeAndRegisterTestInfo( \
- #test_suite_name, #test_name, nullptr, nullptr, \
- ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
- ::testing::internal::SuiteApiResolver< \
- parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \
- ::testing::internal::SuiteApiResolver< \
- parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \
- new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \
- test_suite_name, test_name)>); \
+#define GTEST_TEST_(test_suite_name, test_name, parent_class, parent_id) \
+ static_assert(sizeof(GTEST_STRINGIFY_(test_suite_name)) > 1, \
+ "test_suite_name must not be empty"); \
+ static_assert(sizeof(GTEST_STRINGIFY_(test_name)) > 1, \
+ "test_name must not be empty"); \
+ class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ : public parent_class { \
+ public: \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default; \
+ ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (const GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &) = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &) = delete; /* NOLINT */ \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) \
+ (GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) &&) noexcept = delete; \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, test_name) & operator=( \
+ GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name) &&) noexcept = delete; /* NOLINT */ \
+ \
+ private: \
+ void TestBody() override; \
+ static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_; \
+ }; \
+ \
+ ::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_suite_name, \
+ test_name)::test_info_ = \
+ ::testing::internal::MakeAndRegisterTestInfo( \
+ #test_suite_name, #test_name, nullptr, nullptr, \
+ ::testing::internal::CodeLocation(__FILE__, __LINE__), (parent_id), \
+ ::testing::internal::SuiteApiResolver< \
+ parent_class>::GetSetUpCaseOrSuite(__FILE__, __LINE__), \
+ ::testing::internal::SuiteApiResolver< \
+ parent_class>::GetTearDownCaseOrSuite(__FILE__, __LINE__), \
+ new ::testing::internal::TestFactoryImpl<GTEST_TEST_CLASS_NAME_( \
+ test_suite_name, test_name)>); \
void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
#endif // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
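The net effect of the GTEST_TEST_ hunk above: the generated test class now spells out its deleted copy/move members instead of invoking the retired GTEST_DISALLOW_* macros. A hedged sketch of what TEST(Foo, Bar) now produces (registration boilerplate omitted; generated names follow the suite_test_Test pattern):

#include "gtest/gtest.h"

class Foo_Bar_Test : public ::testing::Test {
 public:
  Foo_Bar_Test() = default;
  ~Foo_Bar_Test() override = default;
  // Previously GTEST_DISALLOW_COPY_AND_ASSIGN_(Foo_Bar_Test);
  Foo_Bar_Test(const Foo_Bar_Test&) = delete;
  Foo_Bar_Test& operator=(const Foo_Bar_Test&) = delete;
  // Previously GTEST_DISALLOW_MOVE_AND_ASSIGN_(Foo_Bar_Test);
  Foo_Bar_Test(Foo_Bar_Test&&) noexcept = delete;
  Foo_Bar_Test& operator=(Foo_Bar_Test&&) noexcept = delete;

 private:
  void TestBody() override;  // the TEST body becomes this definition
};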
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
index c2ef6e312..e7af2f904 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
@@ -27,10 +27,11 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Type and function utilities for implementing parameterized tests.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
@@ -46,19 +47,18 @@
#include <utility>
#include <vector>
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
#include "gtest/gtest-printers.h"
#include "gtest/gtest-test-part.h"
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
namespace testing {
// Input to a parameterized test name generator, describing a test parameter.
// Consists of the parameter value and the integer parameter index.
template <class ParamType>
struct TestParamInfo {
- TestParamInfo(const ParamType& a_param, size_t an_index) :
- param(a_param),
- index(an_index) {}
+ TestParamInfo(const ParamType& a_param, size_t an_index)
+ : param(a_param), index(an_index) {}
ParamType param;
size_t index;
};
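A hedged usage sketch: the name-generator callback passed to INSTANTIATE_TEST_SUITE_P receives exactly this TestParamInfo (param plus index). All names here are illustrative:

#include <string>
#include "gtest/gtest.h"

class WidthTest : public ::testing::TestWithParam<int> {};
TEST_P(WidthTest, IsPositive) { EXPECT_GT(GetParam(), 0); }

INSTANTIATE_TEST_SUITE_P(
    Widths, WidthTest, ::testing::Values(8, 16, 32),
    // info.param is the value, info.index its position in the sequence.
    [](const ::testing::TestParamInfo<int>& info) {
      return "w" + std::to_string(info.param);
    });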
@@ -84,8 +84,10 @@ namespace internal {
GTEST_API_ void ReportInvalidTestSuiteType(const char* test_suite_name,
CodeLocation code_location);
-template <typename> class ParamGeneratorInterface;
-template <typename> class ParamGenerator;
+template <typename>
+class ParamGeneratorInterface;
+template <typename>
+class ParamGenerator;
// Interface for iterating over elements provided by an implementation
// of ParamGeneratorInterface<T>.
@@ -129,8 +131,7 @@ class ParamIterator {
// ParamIterator assumes ownership of the impl_ pointer.
ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {}
ParamIterator& operator=(const ParamIterator& other) {
- if (this != &other)
- impl_.reset(other.impl_->Clone());
+ if (this != &other) impl_.reset(other.impl_->Clone());
return *this;
}
@@ -157,7 +158,7 @@ class ParamIterator {
private:
friend class ParamGenerator<T>;
explicit ParamIterator(ParamIteratorInterface<T>* impl) : impl_(impl) {}
- std::unique_ptr<ParamIteratorInterface<T> > impl_;
+ std::unique_ptr<ParamIteratorInterface<T>> impl_;
};
// ParamGeneratorInterface<T> is the binary interface to access generators
@@ -179,7 +180,7 @@ class ParamGeneratorInterface {
// This class implements copy initialization semantics and the contained
// ParamGeneratorInterface<T> instance is shared among all copies
// of the original object. This is possible because that instance is immutable.
-template<typename T>
+template <typename T>
class ParamGenerator {
public:
typedef ParamIterator<T> iterator;
@@ -196,7 +197,7 @@ class ParamGenerator {
iterator end() const { return iterator(impl_->End()); }
private:
- std::shared_ptr<const ParamGeneratorInterface<T> > impl_;
+ std::shared_ptr<const ParamGeneratorInterface<T>> impl_;
};
// Generates values from a range of two comparable values. Can be used to
@@ -207,8 +208,10 @@ template <typename T, typename IncrementT>
class RangeGenerator : public ParamGeneratorInterface<T> {
public:
RangeGenerator(T begin, T end, IncrementT step)
- : begin_(begin), end_(end),
- step_(step), end_index_(CalculateEndIndex(begin, end, step)) {}
+ : begin_(begin),
+ end_(end),
+ step_(step),
+ end_index_(CalculateEndIndex(begin, end, step)) {}
~RangeGenerator() override {}
ParamIteratorInterface<T>* Begin() const override {
@@ -251,7 +254,9 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
private:
Iterator(const Iterator& other)
: ParamIteratorInterface<T>(),
- base_(other.base_), value_(other.value_), index_(other.index_),
+ base_(other.base_),
+ value_(other.value_),
+ index_(other.index_),
step_(other.step_) {}
// No implementation - assignment is unsupported.
@@ -263,12 +268,10 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
const IncrementT step_;
}; // class RangeGenerator::Iterator
- static int CalculateEndIndex(const T& begin,
- const T& end,
+ static int CalculateEndIndex(const T& begin, const T& end,
const IncrementT& step) {
int end_index = 0;
- for (T i = begin; i < end; i = static_cast<T>(i + step))
- end_index++;
+ for (T i = begin; i < end; i = static_cast<T>(i + step)) end_index++;
return end_index;
}
@@ -283,7 +286,6 @@ class RangeGenerator : public ParamGeneratorInterface<T> {
const int end_index_;
}; // class RangeGenerator
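RangeGenerator is the machinery behind the public Range() factory; a hedged sketch with illustrative names:

#include "gtest/gtest.h"

class StrideTest : public ::testing::TestWithParam<int> {};
TEST_P(StrideTest, BelowEnd) { EXPECT_LT(GetParam(), 10); }

// Range(0, 10, 3) yields 0, 3, 6, 9 -- the same count that
// CalculateEndIndex() computes above. The step defaults to 1 if omitted.
INSTANTIATE_TEST_SUITE_P(Strides, StrideTest, ::testing::Range(0, 10, 3));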
-
// Generates values from a pair of STL-style iterators. Used in the
// ValuesIn() function. The elements are copied from the source range
// since the source can be located on the stack, and the generator
@@ -341,13 +343,13 @@ class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface<T> {
<< "The program attempted to compare iterators "
<< "from different generators." << std::endl;
return iterator_ ==
- CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
+ CheckedDowncastToActualType<const Iterator>(&other)->iterator_;
}
private:
Iterator(const Iterator& other)
- // The explicit constructor call suppresses a false warning
- // emitted by gcc when supplied with the -Wextra option.
+ // The explicit constructor call suppresses a false warning
+ // emitted by gcc when supplied with the -Wextra option.
: ParamIteratorInterface<T>(),
base_(other.base_),
iterator_(other.iterator_) {}
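This generator backs the public ValuesIn() factory; a hedged sketch (container and names illustrative):

#include <vector>
#include "gtest/gtest.h"

class PortTest : public ::testing::TestWithParam<int> {};
TEST_P(PortTest, NonZero) { EXPECT_NE(GetParam(), 0); }

// As the comment above notes, the elements are copied out of the source
// range at instantiation time, so the container need not outlive it.
static const std::vector<int> kPorts = {80, 443, 8080};
INSTANTIATE_TEST_SUITE_P(Ports, PortTest, ::testing::ValuesIn(kPorts));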
@@ -394,8 +396,8 @@ template <class TestClass>
class ParameterizedTestFactory : public TestFactoryBase {
public:
typedef typename TestClass::ParamType ParamType;
- explicit ParameterizedTestFactory(ParamType parameter) :
- parameter_(parameter) {}
+ explicit ParameterizedTestFactory(ParamType parameter)
+ : parameter_(parameter) {}
Test* CreateTest() override {
TestClass::SetParam(&parameter_);
return new TestClass();
@@ -404,7 +406,8 @@ class ParameterizedTestFactory : public TestFactoryBase {
private:
const ParamType parameter_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory);
+ ParameterizedTestFactory(const ParameterizedTestFactory&) = delete;
+ ParameterizedTestFactory& operator=(const ParameterizedTestFactory&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -440,7 +443,8 @@ class TestMetaFactory
}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory);
+ TestMetaFactory(const TestMetaFactory&) = delete;
+ TestMetaFactory& operator=(const TestMetaFactory&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -471,7 +475,10 @@ class ParameterizedTestSuiteInfoBase {
ParameterizedTestSuiteInfoBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfoBase);
+ ParameterizedTestSuiteInfoBase(const ParameterizedTestSuiteInfoBase&) =
+ delete;
+ ParameterizedTestSuiteInfoBase& operator=(
+ const ParameterizedTestSuiteInfoBase&) = delete;
};
// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
@@ -547,8 +554,8 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
test_it != tests_.end(); ++test_it) {
std::shared_ptr<TestInfo> test_info = *test_it;
for (typename InstantiationContainer::iterator gen_it =
- instantiations_.begin(); gen_it != instantiations_.end();
- ++gen_it) {
+ instantiations_.begin();
+ gen_it != instantiations_.end(); ++gen_it) {
const std::string& instantiation_name = gen_it->name;
ParamGenerator<ParamType> generator((*gen_it->generator)());
ParamNameGeneratorFunc* name_func = gen_it->name_func;
@@ -556,7 +563,7 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
int line = gen_it->line;
std::string test_suite_name;
- if ( !instantiation_name.empty() )
+ if (!instantiation_name.empty())
test_suite_name = instantiation_name + "/";
test_suite_name += test_info->test_suite_base_name;
@@ -569,17 +576,16 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
Message test_name_stream;
- std::string param_name = name_func(
- TestParamInfo<ParamType>(*param_it, i));
+ std::string param_name =
+ name_func(TestParamInfo<ParamType>(*param_it, i));
GTEST_CHECK_(IsValidParamName(param_name))
<< "Parameterized test name '" << param_name
- << "' is invalid, in " << file
- << " line " << line << std::endl;
+ << "' is invalid, in " << file << " line " << line << std::endl;
GTEST_CHECK_(test_param_names.count(param_name) == 0)
- << "Duplicate parameterized test name '" << param_name
- << "', in " << file << " line " << line << std::endl;
+ << "Duplicate parameterized test name '" << param_name << "', in "
+ << file << " line " << line << std::endl;
test_param_names.insert(param_name);
@@ -596,15 +602,15 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
test_info->test_meta_factory->CreateTestFactory(*param_it));
} // for param_it
- } // for gen_it
- } // for test_it
+ } // for gen_it
+ } // for test_it
if (!generated_instantiations) {
// There are no generators, or they all generate nothing ...
InsertSyntheticTestCase(GetTestSuiteName(), code_location_,
!tests_.empty());
}
- } // RegisterTests
+ } // RegisterTests
private:
// LocalTestInfo structure keeps information about a single test registered
@@ -620,42 +626,39 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
const std::string test_suite_base_name;
const std::string test_base_name;
- const std::unique_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+ const std::unique_ptr<TestMetaFactoryBase<ParamType>> test_meta_factory;
const CodeLocation code_location;
};
- using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo> >;
+ using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo>>;
// Records data received from INSTANTIATE_TEST_SUITE_P macros:
// <Instantiation name, Sequence generator creation function,
// Name generator function, Source file, Source line>
struct InstantiationInfo {
- InstantiationInfo(const std::string &name_in,
- GeneratorCreationFunc* generator_in,
- ParamNameGeneratorFunc* name_func_in,
- const char* file_in,
- int line_in)
- : name(name_in),
- generator(generator_in),
- name_func(name_func_in),
- file(file_in),
- line(line_in) {}
-
- std::string name;
- GeneratorCreationFunc* generator;
- ParamNameGeneratorFunc* name_func;
- const char* file;
- int line;
+ InstantiationInfo(const std::string& name_in,
+ GeneratorCreationFunc* generator_in,
+ ParamNameGeneratorFunc* name_func_in, const char* file_in,
+ int line_in)
+ : name(name_in),
+ generator(generator_in),
+ name_func(name_func_in),
+ file(file_in),
+ line(line_in) {}
+
+ std::string name;
+ GeneratorCreationFunc* generator;
+ ParamNameGeneratorFunc* name_func;
+ const char* file;
+ int line;
};
typedef ::std::vector<InstantiationInfo> InstantiationContainer;
static bool IsValidParamName(const std::string& name) {
// Check for empty string
- if (name.empty())
- return false;
+ if (name.empty()) return false;
// Check for invalid characters
for (std::string::size_type index = 0; index < name.size(); ++index) {
- if (!IsAlNum(name[index]) && name[index] != '_')
- return false;
+ if (!IsAlNum(name[index]) && name[index] != '_') return false;
}
return true;
@@ -666,7 +669,9 @@ class ParameterizedTestSuiteInfo : public ParameterizedTestSuiteInfoBase {
TestInfoContainer tests_;
InstantiationContainer instantiations_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteInfo);
+ ParameterizedTestSuiteInfo(const ParameterizedTestSuiteInfo&) = delete;
+ ParameterizedTestSuiteInfo& operator=(const ParameterizedTestSuiteInfo&) =
+ delete;
}; // class ParameterizedTestSuiteInfo
// Legacy API is deprecated but still available
@@ -709,7 +714,7 @@ class ParameterizedTestSuiteRegistry {
// type we are looking for, so we downcast it to that type
// without further checks.
typed_test_info = CheckedDowncastToActualType<
- ParameterizedTestSuiteInfo<TestSuite> >(test_suite_info);
+ ParameterizedTestSuiteInfo<TestSuite>>(test_suite_info);
}
break;
}
@@ -741,7 +746,10 @@ class ParameterizedTestSuiteRegistry {
TestSuiteInfoContainer test_suite_infos_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestSuiteRegistry);
+ ParameterizedTestSuiteRegistry(const ParameterizedTestSuiteRegistry&) =
+ delete;
+ ParameterizedTestSuiteRegistry& operator=(
+ const ParameterizedTestSuiteRegistry&) = delete;
};
// Keep track of what type-parameterized test suites are defined and
@@ -836,7 +844,8 @@ class CartesianProductGenerator
: public ParamIteratorInterface<ParamType> {
public:
IteratorImpl(const ParamGeneratorInterface<ParamType>* base,
- const std::tuple<ParamGenerator<T>...>& generators, bool is_end)
+ const std::tuple<ParamGenerator<T>...>& generators,
+ bool is_end)
: base_(base),
begin_(std::get<I>(generators).begin()...),
end_(std::get<I>(generators).end()...),
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
index dd845915e..f025db76a 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file defines the GTEST_OS_* macro.
@@ -37,70 +37,72 @@
// Determines the platform on which Google Test is compiled.
#ifdef __CYGWIN__
-# define GTEST_OS_CYGWIN 1
-# elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
-# define GTEST_OS_WINDOWS_MINGW 1
-# define GTEST_OS_WINDOWS 1
+#define GTEST_OS_CYGWIN 1
+#elif defined(__MINGW__) || defined(__MINGW32__) || defined(__MINGW64__)
+#define GTEST_OS_WINDOWS_MINGW 1
+#define GTEST_OS_WINDOWS 1
#elif defined _WIN32
-# define GTEST_OS_WINDOWS 1
-# ifdef _WIN32_WCE
-# define GTEST_OS_WINDOWS_MOBILE 1
-# elif defined(WINAPI_FAMILY)
-# include <winapifamily.h>
-# if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
-# define GTEST_OS_WINDOWS_PHONE 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
-# define GTEST_OS_WINDOWS_RT 1
-# elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
-# define GTEST_OS_WINDOWS_PHONE 1
-# define GTEST_OS_WINDOWS_TV_TITLE 1
-# else
- // WINAPI_FAMILY defined but no known partition matched.
- // Default to desktop.
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# endif
-# else
-# define GTEST_OS_WINDOWS_DESKTOP 1
-# endif // _WIN32_WCE
+#define GTEST_OS_WINDOWS 1
+#ifdef _WIN32_WCE
+#define GTEST_OS_WINDOWS_MOBILE 1
+#elif defined(WINAPI_FAMILY)
+#include <winapifamily.h>
+#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP)
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_PHONE_APP)
+#define GTEST_OS_WINDOWS_PHONE 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP)
+#define GTEST_OS_WINDOWS_RT 1
+#elif WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_TV_TITLE)
+#define GTEST_OS_WINDOWS_PHONE 1
+#define GTEST_OS_WINDOWS_TV_TITLE 1
+#else
+// WINAPI_FAMILY defined but no known partition matched.
+// Default to desktop.
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif
+#else
+#define GTEST_OS_WINDOWS_DESKTOP 1
+#endif // _WIN32_WCE
#elif defined __OS2__
-# define GTEST_OS_OS2 1
+#define GTEST_OS_OS2 1
#elif defined __APPLE__
-# define GTEST_OS_MAC 1
-# include <TargetConditionals.h>
-# if TARGET_OS_IPHONE
-# define GTEST_OS_IOS 1
-# endif
+#define GTEST_OS_MAC 1
+#include <TargetConditionals.h>
+#if TARGET_OS_IPHONE
+#define GTEST_OS_IOS 1
+#endif
#elif defined __DragonFly__
-# define GTEST_OS_DRAGONFLY 1
+#define GTEST_OS_DRAGONFLY 1
#elif defined __FreeBSD__
-# define GTEST_OS_FREEBSD 1
+#define GTEST_OS_FREEBSD 1
#elif defined __Fuchsia__
-# define GTEST_OS_FUCHSIA 1
+#define GTEST_OS_FUCHSIA 1
+#elif defined(__GNU__)
+#define GTEST_OS_GNU_HURD 1
#elif defined(__GLIBC__) && defined(__FreeBSD_kernel__)
-# define GTEST_OS_GNU_KFREEBSD 1
+#define GTEST_OS_GNU_KFREEBSD 1
#elif defined __linux__
-# define GTEST_OS_LINUX 1
-# if defined __ANDROID__
-# define GTEST_OS_LINUX_ANDROID 1
-# endif
+#define GTEST_OS_LINUX 1
+#if defined __ANDROID__
+#define GTEST_OS_LINUX_ANDROID 1
+#endif
#elif defined __MVS__
-# define GTEST_OS_ZOS 1
+#define GTEST_OS_ZOS 1
#elif defined(__sun) && defined(__SVR4)
-# define GTEST_OS_SOLARIS 1
+#define GTEST_OS_SOLARIS 1
#elif defined(_AIX)
-# define GTEST_OS_AIX 1
+#define GTEST_OS_AIX 1
#elif defined(__hpux)
-# define GTEST_OS_HPUX 1
+#define GTEST_OS_HPUX 1
#elif defined __native_client__
-# define GTEST_OS_NACL 1
+#define GTEST_OS_NACL 1
#elif defined __NetBSD__
-# define GTEST_OS_NETBSD 1
+#define GTEST_OS_NETBSD 1
#elif defined __OpenBSD__
-# define GTEST_OS_OPENBSD 1
+#define GTEST_OS_OPENBSD 1
#elif defined __QNX__
-# define GTEST_OS_QNX 1
+#define GTEST_OS_QNX 1
#elif defined(__HAIKU__)
#define GTEST_OS_HAIKU 1
#elif defined ESP8266
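Each GTEST_OS_* macro is either defined to 1 or left undefined, so downstream code can test them with a bare #if; a hedged sketch:

#include "gtest/internal/gtest-port-arch.h"

const char* GTestPlatformName() {
#if GTEST_OS_WINDOWS
  return "windows";  // covers desktop, MinGW, phone, RT, ...
#elif GTEST_OS_MAC
  return "mac";
#elif GTEST_OS_LINUX
  return "linux";    // GTEST_OS_LINUX_ANDROID narrows this further
#else
  return "other";
#endif
}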
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
index 0953a781c..0003d2765 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// Low-level types and utilities for porting Google Test to various
// platforms. All macros ending with _ and symbols defined in an
// internal namespace are subject to change without notice. Code
@@ -38,7 +38,9 @@
// files are expected to #include this. Therefore, it cannot #include
// any other Google Test header.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
@@ -116,6 +118,7 @@
// GTEST_OS_DRAGONFLY - DragonFlyBSD
// GTEST_OS_FREEBSD - FreeBSD
// GTEST_OS_FUCHSIA - Fuchsia
+// GTEST_OS_GNU_HURD - GNU/Hurd
// GTEST_OS_GNU_KFREEBSD - GNU/kFreeBSD
// GTEST_OS_HAIKU - Haiku
// GTEST_OS_HPUX - HP-UX
@@ -167,7 +170,7 @@
// GTEST_HAS_TYPED_TEST - typed tests
// GTEST_HAS_TYPED_TEST_P - type-parameterized tests
// GTEST_IS_THREADSAFE - Google Test is thread-safe.
-// GOOGLETEST_CM0007 DO NOT DELETE
+// GTEST_USES_RE2 - the RE2 regular expression library is used
// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with
// GTEST_HAS_POSIX_RE (see above) which users can
// define themselves.
@@ -190,10 +193,6 @@
// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning.
// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a
// variable don't have to be used.
-// GTEST_DISALLOW_ASSIGN_ - disables copy operator=.
-// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=.
-// GTEST_DISALLOW_MOVE_ASSIGN_ - disables move operator=.
-// GTEST_DISALLOW_MOVE_AND_ASSIGN_ - disables move ctor and operator=.
// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used.
// GTEST_INTENTIONAL_CONST_COND_PUSH_ - start code section where MSVC C4127 is
// suppressed (constant conditional).
@@ -217,11 +216,13 @@
// - synchronization primitives.
//
// Regular expressions:
-// RE - a simple regular expression class using the POSIX
-// Extended Regular Expression syntax on UNIX-like platforms
-// GOOGLETEST_CM0008 DO NOT DELETE
-// or a reduced regular exception syntax on other
-// platforms, including Windows.
+// RE - a simple regular expression class using
+// 1) the RE2 syntax on all platforms when built with RE2
+// and Abseil as dependencies
+// 2) the POSIX Extended Regular Expression syntax on
+// UNIX-like platforms,
+// 3) a reduced regular expression syntax on other platforms,
+// including Windows.
// Logging:
// GTEST_LOG_() - logs messages at the specified severity level.
// LogToStderr() - directs all log messages to stderr.
@@ -241,8 +242,6 @@
// BiggestInt - the biggest signed integer type.
//
// Command-line utilities:
-// GTEST_DECLARE_*() - declares a flag.
-// GTEST_DEFINE_*() - defines a flag.
// GetInjectableArgvs() - returns the command line as a vector of strings.
//
// Environment variable utilities:
@@ -263,48 +262,55 @@
#include <string.h>
#include <cerrno>
+// #include <condition_variable> // Guarded by GTEST_IS_THREADSAFE below
#include <cstdint>
+#include <iostream>
#include <limits>
+#include <locale>
+#include <memory>
+#include <string>
+// #include <mutex> // Guarded by GTEST_IS_THREADSAFE below
+#include <tuple>
#include <type_traits>
+#include <vector>
#ifndef _WIN32_WCE
-# include <sys/types.h>
-# include <sys/stat.h>
+#include <sys/stat.h>
+#include <sys/types.h>
#endif // !_WIN32_WCE
#if defined __APPLE__
-# include <AvailabilityMacros.h>
-# include <TargetConditionals.h>
+#include <AvailabilityMacros.h>
+#include <TargetConditionals.h>
#endif
-#include <iostream> // NOLINT
-#include <locale>
-#include <memory>
-#include <string> // NOLINT
-#include <tuple>
-#include <vector> // NOLINT
-
#include "gtest/internal/custom/gtest-port.h"
#include "gtest/internal/gtest-port-arch.h"
+#if GTEST_HAS_ABSL
+#include "absl/flags/declare.h"
+#include "absl/flags/flag.h"
+#include "absl/flags/reflection.h"
+#endif
+
#if !defined(GTEST_DEV_EMAIL_)
-# define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
-# define GTEST_FLAG_PREFIX_ "gtest_"
-# define GTEST_FLAG_PREFIX_DASH_ "gtest-"
-# define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
-# define GTEST_NAME_ "Google Test"
-# define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
+#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com"
+#define GTEST_FLAG_PREFIX_ "gtest_"
+#define GTEST_FLAG_PREFIX_DASH_ "gtest-"
+#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_"
+#define GTEST_NAME_ "Google Test"
+#define GTEST_PROJECT_URL_ "https://github.com/google/googletest/"
#endif // !defined(GTEST_DEV_EMAIL_)
#if !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
-# define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
+#define GTEST_INIT_GOOGLE_TEST_NAME_ "testing::InitGoogleTest"
#endif // !defined(GTEST_INIT_GOOGLE_TEST_NAME_)
// Determines the version of gcc that is used to compile this.
#ifdef __GNUC__
// 40302 means version 4.3.2.
-# define GTEST_GCC_VER_ \
- (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__)
+#define GTEST_GCC_VER_ \
+ (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#endif // __GNUC__
// Macros for disabling Microsoft Visual C++ warnings.
@@ -313,41 +319,37 @@
// /* code that triggers warnings C4800 and C4385 */
// GTEST_DISABLE_MSC_WARNINGS_POP_()
#if defined(_MSC_VER)
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
- __pragma(warning(push)) \
- __pragma(warning(disable: warnings))
-# define GTEST_DISABLE_MSC_WARNINGS_POP_() \
- __pragma(warning(pop))
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings) \
+ __pragma(warning(push)) __pragma(warning(disable : warnings))
+#define GTEST_DISABLE_MSC_WARNINGS_POP_() __pragma(warning(pop))
#else
// Not all compilers are MSVC
-# define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
-# define GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_DISABLE_MSC_WARNINGS_PUSH_(warnings)
+#define GTEST_DISABLE_MSC_WARNINGS_POP_()
#endif
// Clang on Windows does not understand MSVC's pragma warning.
// We need a clang-specific way to disable the function deprecation warning.
#ifdef __clang__
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
- _Pragma("clang diagnostic push") \
- _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
- _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
-#define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
- _Pragma("clang diagnostic pop")
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+ _Pragma("clang diagnostic push") \
+ _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") \
+ _Pragma("clang diagnostic ignored \"-Wdeprecated-implementations\"")
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() _Pragma("clang diagnostic pop")
#else
-# define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
-# define GTEST_DISABLE_MSC_DEPRECATED_POP_() \
- GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_DISABLE_MSC_DEPRECATED_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4996)
+#define GTEST_DISABLE_MSC_DEPRECATED_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
#endif
// Brings in definitions for functions used in the testing::internal::posix
// namespace (read, write, close, chdir, isatty, stat). We do not currently
// use them on Windows Mobile.
#if GTEST_OS_WINDOWS
-# if !GTEST_OS_WINDOWS_MOBILE
-# include <direct.h>
-# include <io.h>
-# endif
+#if !GTEST_OS_WINDOWS_MOBILE
+#include <direct.h>
+#include <io.h>
+#endif
// In order to avoid having to include <windows.h>, use forward declaration
#if GTEST_OS_WINDOWS_MINGW && !defined(__MINGW64_VERSION_MAJOR)
// MinGW defined _CRITICAL_SECTION and _RTL_CRITICAL_SECTION as two
@@ -367,68 +369,55 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// This assumes that non-Windows OSes provide unistd.h. For OSes where this
// is not the case, we need to include headers that provide the functions
// mentioned above.
-# include <unistd.h>
-# include <strings.h>
+#include <strings.h>
+#include <unistd.h>
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_LINUX_ANDROID
// Used to define __ANDROID_API__ matching the target NDK API level.
-# include <android/api-level.h> // NOLINT
+#include <android/api-level.h> // NOLINT
#endif
// Defines this to true if and only if Google Test can use POSIX regular
// expressions.
#ifndef GTEST_HAS_POSIX_RE
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
// On Android, <regex.h> is only available starting with Gingerbread.
-# define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
-# else
+#define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
+#else
#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA)
-# endif
+#endif
#endif
-#if GTEST_USES_PCRE
-// The appropriate headers have already been included.
-
+// Select the regular expression implementation.
+#if GTEST_HAS_ABSL
+// When using Abseil, RE2 is required.
+#include "absl/strings/string_view.h"
+#include "re2/re2.h"
+#define GTEST_USES_RE2 1
#elif GTEST_HAS_POSIX_RE
-
-// On some platforms, <regex.h> needs someone to define size_t, and
-// won't compile otherwise. We can #include it here as we already
-// included <stdlib.h>, which is guaranteed to define size_t through
-// <stddef.h>.
-# include <regex.h> // NOLINT
-
-# define GTEST_USES_POSIX_RE 1
-
-#elif GTEST_OS_WINDOWS
-
-// <regex.h> is not available on Windows. Use our own simple regex
-// implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
+#include <regex.h> // NOLINT
+#define GTEST_USES_POSIX_RE 1
#else
-
-// <regex.h> may not be available on this platform. Use our own
-// simple regex implementation instead.
-# define GTEST_USES_SIMPLE_RE 1
-
-#endif // GTEST_USES_PCRE
+// Use our own simple regex implementation.
+#define GTEST_USES_SIMPLE_RE 1
+#endif
#ifndef GTEST_HAS_EXCEPTIONS
// The user didn't tell us whether exceptions are enabled, so we need
// to figure it out.
-# if defined(_MSC_VER) && defined(_CPPUNWIND)
+#if defined(_MSC_VER) && defined(_CPPUNWIND)
// MSVC defines _CPPUNWIND to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__BORLANDC__)
// C++Builder's implementation of the STL uses the _HAS_EXCEPTIONS
// macro to enable exceptions, so we'll do the same.
// Assumes that exceptions are enabled by default.
-# ifndef _HAS_EXCEPTIONS
-# define _HAS_EXCEPTIONS 1
-# endif // _HAS_EXCEPTIONS
-# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
-# elif defined(__clang__)
+#ifndef _HAS_EXCEPTIONS
+#define _HAS_EXCEPTIONS 1
+#endif // _HAS_EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS
+#elif defined(__clang__)
// clang defines __EXCEPTIONS if and only if exceptions are enabled before clang
// 220714, but if and only if cleanups are enabled after that. In Obj-C++ files,
// there can be cleanups for ObjC exceptions which also need cleanups, even if
@@ -437,27 +426,27 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// cleanups prior to that. To reliably check for C++ exception availability with
// clang, check for
// __EXCEPTIONS && __has_feature(cxx_exceptions).
-# define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
-# elif defined(__GNUC__) && __EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS (__EXCEPTIONS && __has_feature(cxx_exceptions))
+#elif defined(__GNUC__) && __EXCEPTIONS
// gcc defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__SUNPRO_CC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__SUNPRO_CC)
// Sun Pro CC supports exceptions. However, there is no compile-time way of
// detecting whether they are enabled or not. Therefore, we assume that
// they are enabled unless the user tells us otherwise.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__IBMCPP__) && __EXCEPTIONS
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__IBMCPP__) && __EXCEPTIONS
// xlC defines __EXCEPTIONS to 1 if and only if exceptions are enabled.
-# define GTEST_HAS_EXCEPTIONS 1
-# elif defined(__HP_aCC)
+#define GTEST_HAS_EXCEPTIONS 1
+#elif defined(__HP_aCC)
// Exception handling is in effect by default in HP aCC compiler. It has to
// be turned off by the +noeh compiler option if desired.
-# define GTEST_HAS_EXCEPTIONS 1
-# else
+#define GTEST_HAS_EXCEPTIONS 1
+#else
// For other compilers, we assume exceptions are disabled to be
// conservative.
-# define GTEST_HAS_EXCEPTIONS 0
-# endif // defined(_MSC_VER) || defined(__BORLANDC__)
+#define GTEST_HAS_EXCEPTIONS 0
+#endif // defined(_MSC_VER) || defined(__BORLANDC__)
#endif // GTEST_HAS_EXCEPTIONS
#ifndef GTEST_HAS_STD_WSTRING
@@ -477,63 +466,62 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// The user didn't tell us whether RTTI is enabled, so we need to
// figure it out.
-# ifdef _MSC_VER
+#ifdef _MSC_VER
#ifdef _CPPRTTI // MSVC defines this macro if and only if RTTI is enabled.
-# define GTEST_HAS_RTTI 1
-# else
-# define GTEST_HAS_RTTI 0
-# endif
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
// Starting with version 4.3.2, gcc defines __GXX_RTTI if and only if RTTI is
// enabled.
-# elif defined(__GNUC__)
+#elif defined(__GNUC__)
-# ifdef __GXX_RTTI
+#ifdef __GXX_RTTI
// When building against STLport with the Android NDK and with
// -frtti -fno-exceptions, the build fails at link time with undefined
// references to __cxa_bad_typeid. Not sure if STL or toolchain bug,
// so disable RTTI when detected.
-# if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && \
- !defined(__EXCEPTIONS)
-# define GTEST_HAS_RTTI 0
-# else
-# define GTEST_HAS_RTTI 1
-# endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
-# else
-# define GTEST_HAS_RTTI 0
-# endif // __GXX_RTTI
+#if GTEST_OS_LINUX_ANDROID && defined(_STLPORT_MAJOR) && !defined(__EXCEPTIONS)
+#define GTEST_HAS_RTTI 0
+#else
+#define GTEST_HAS_RTTI 1
+#endif // GTEST_OS_LINUX_ANDROID && __STLPORT_MAJOR && !__EXCEPTIONS
+#else
+#define GTEST_HAS_RTTI 0
+#endif // __GXX_RTTI
// Clang defines __GXX_RTTI starting with version 3.0, but its manual recommends
// using has_feature instead. has_feature(cxx_rtti) is supported since 2.7, the
// first version with C++ support.
-# elif defined(__clang__)
+#elif defined(__clang__)
-# define GTEST_HAS_RTTI __has_feature(cxx_rtti)
+#define GTEST_HAS_RTTI __has_feature(cxx_rtti)
// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if
// both the typeid and dynamic_cast features are present.
-# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
+#elif defined(__IBMCPP__) && (__IBMCPP__ >= 900)
-# ifdef __RTTI_ALL__
-# define GTEST_HAS_RTTI 1
-# else
-# define GTEST_HAS_RTTI 0
-# endif
+#ifdef __RTTI_ALL__
+#define GTEST_HAS_RTTI 1
+#else
+#define GTEST_HAS_RTTI 0
+#endif
-# else
+#else
// For all other compilers, we assume RTTI is enabled.
-# define GTEST_HAS_RTTI 1
+#define GTEST_HAS_RTTI 1
-# endif // _MSC_VER
+#endif // _MSC_VER
#endif // GTEST_HAS_RTTI
// It's this header's responsibility to #include <typeinfo> when RTTI
// is enabled.
#if GTEST_HAS_RTTI
-# include <typeinfo>
+#include <typeinfo>
#endif
// Determines whether Google Test can use the pthreads library.
@@ -547,16 +535,16 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
(GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX || GTEST_OS_QNX || \
GTEST_OS_FREEBSD || GTEST_OS_NACL || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_OPENBSD || \
- GTEST_OS_HAIKU)
+ GTEST_OS_HAIKU || GTEST_OS_GNU_HURD)
#endif // GTEST_HAS_PTHREAD
#if GTEST_HAS_PTHREAD
// gtest-port.h guarantees to #include <pthread.h> when GTEST_HAS_PTHREAD is
// true.
-# include <pthread.h> // NOLINT
+#include <pthread.h> // NOLINT
// For timespec and nanosleep, used below.
-# include <time.h> // NOLINT
+#include <time.h> // NOLINT
#endif
// Determines whether clone(2) is supported.
@@ -566,24 +554,23 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#ifndef GTEST_HAS_CLONE
// The user didn't tell us, so we need to figure it out.
-# if GTEST_OS_LINUX && !defined(__ia64__)
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX && !defined(__ia64__)
+#if GTEST_OS_LINUX_ANDROID
// On Android, clone() became available at different API levels for each 32-bit
// architecture.
-# if defined(__LP64__) || \
- (defined(__arm__) && __ANDROID_API__ >= 9) || \
- (defined(__mips__) && __ANDROID_API__ >= 12) || \
- (defined(__i386__) && __ANDROID_API__ >= 17)
-# define GTEST_HAS_CLONE 1
-# else
-# define GTEST_HAS_CLONE 0
-# endif
-# else
-# define GTEST_HAS_CLONE 1
-# endif
-# else
-# define GTEST_HAS_CLONE 0
-# endif // GTEST_OS_LINUX && !defined(__ia64__)
+#if defined(__LP64__) || (defined(__arm__) && __ANDROID_API__ >= 9) || \
+ (defined(__mips__) && __ANDROID_API__ >= 12) || \
+ (defined(__i386__) && __ANDROID_API__ >= 17)
+#define GTEST_HAS_CLONE 1
+#else
+#define GTEST_HAS_CLONE 0
+#endif
+#else
+#define GTEST_HAS_CLONE 1
+#endif
+#else
+#define GTEST_HAS_CLONE 0
+#endif // GTEST_OS_LINUX && !defined(__ia64__)
#endif // GTEST_HAS_CLONE
@@ -594,10 +581,10 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// platforms except known mobile ones.
#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
-# define GTEST_HAS_STREAM_REDIRECTION 0
-# else
-# define GTEST_HAS_STREAM_REDIRECTION 1
-# endif // !GTEST_OS_WINDOWS_MOBILE
+#define GTEST_HAS_STREAM_REDIRECTION 0
+#else
+#define GTEST_HAS_STREAM_REDIRECTION 1
+#endif // !GTEST_OS_WINDOWS_MOBILE
#endif // GTEST_HAS_STREAM_REDIRECTION
// Determines whether to support death tests.
@@ -607,8 +594,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
(GTEST_OS_WINDOWS_DESKTOP && _MSC_VER) || GTEST_OS_WINDOWS_MINGW || \
GTEST_OS_AIX || GTEST_OS_HPUX || GTEST_OS_OPENBSD || GTEST_OS_QNX || \
GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_FUCHSIA || \
- GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU)
-# define GTEST_HAS_DEATH_TEST 1
+ GTEST_OS_DRAGONFLY || GTEST_OS_GNU_KFREEBSD || GTEST_OS_HAIKU || \
+ GTEST_OS_GNU_HURD)
+#define GTEST_HAS_DEATH_TEST 1
#endif
// Determines whether to support type-driven tests.
@@ -617,8 +605,8 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Sun Pro CC, IBM Visual Age, and HP aCC support.
#if defined(__GNUC__) || defined(_MSC_VER) || defined(__SUNPRO_CC) || \
defined(__IBMCPP__) || defined(__HP_aCC)
-# define GTEST_HAS_TYPED_TEST 1
-# define GTEST_HAS_TYPED_TEST_P 1
+#define GTEST_HAS_TYPED_TEST 1
+#define GTEST_HAS_TYPED_TEST_P 1
#endif
// Determines whether the system compiler uses UTF-16 for encoding wide strings.
@@ -627,8 +615,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Determines whether test results can be streamed to a socket.
#if GTEST_OS_LINUX || GTEST_OS_GNU_KFREEBSD || GTEST_OS_DRAGONFLY || \
- GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# define GTEST_CAN_STREAM_RESULTS_ 1
+ GTEST_OS_FREEBSD || GTEST_OS_NETBSD || GTEST_OS_OPENBSD || \
+ GTEST_OS_GNU_HURD
+#define GTEST_CAN_STREAM_RESULTS_ 1
#endif
// Defines some utility macros.
@@ -642,9 +631,12 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
//
// The "switch (0) case 0:" idiom is used to suppress this.
#ifdef __INTEL_COMPILER
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_
#else
-# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT
+#define GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ switch (0) \
+ case 0: \
+ default: // NOLINT
#endif
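For the record, the warning this blocker suppresses: an assertion macro that expands to an unbraced if/else, nested under a user's own if, trips gcc's ambiguous-else diagnostic. A hedged sketch with a hypothetical macro:

#include <cstdio>

static void Fail(const char* expr) { std::fprintf(stderr, "failed: %s\n", expr); }

// Hypothetical macro shaped like GTEST_CHECK_, minus the blocker.
#define MY_CHECK(cond) \
  if (cond)            \
    ;                  \
  else                 \
    Fail(#cond)

void Demo(bool flag) {
  if (flag) MY_CHECK(flag);  // gcc: "suggest explicit braces to avoid
                             // ambiguous 'else'"; the switch (0) prefix
                             // in the real macro silences this.
}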
// Use this annotation at the end of a struct/class definition to
@@ -659,55 +651,32 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// Also use it after a variable or parameter declaration to tell the
// compiler the variable/parameter does not have to be used.
#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
#elif defined(__clang__)
-# if __has_attribute(unused)
-# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused))
-# endif
+#if __has_attribute(unused)
+#define GTEST_ATTRIBUTE_UNUSED_ __attribute__((unused))
+#endif
#endif
#ifndef GTEST_ATTRIBUTE_UNUSED_
-# define GTEST_ATTRIBUTE_UNUSED_
+#define GTEST_ATTRIBUTE_UNUSED_
#endif
// Use this annotation before a function that takes a printf format string.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(COMPILER_ICC)
-# if defined(__MINGW_PRINTF_FORMAT)
+#if defined(__MINGW_PRINTF_FORMAT)
// MinGW has two different printf implementations. Ensure the format macro
// matches the selected implementation. See
// https://sourceforge.net/p/mingw-w64/wiki2/gnu%20printf/.
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
- __attribute__((__format__(__MINGW_PRINTF_FORMAT, string_index, \
- first_to_check)))
-# else
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
- __attribute__((__format__(__printf__, string_index, first_to_check)))
-# endif
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+ __attribute__(( \
+ __format__(__MINGW_PRINTF_FORMAT, string_index, first_to_check)))
#else
-# define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check) \
+ __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#else
+#define GTEST_ATTRIBUTE_PRINTF_(string_index, first_to_check)
#endif
-
-
-// A macro to disallow copy operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_ASSIGN_(type) \
- type& operator=(type const &) = delete
-
-// A macro to disallow copy constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
- type(type const&) = delete; \
- type& operator=(type const&) = delete
-
-// A macro to disallow move operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_MOVE_ASSIGN_(type) \
- type& operator=(type &&) noexcept = delete
-
-// A macro to disallow move constructor and operator=
-// This should be used in the private: declarations for a class.
-#define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \
- type(type&&) noexcept = delete; \
- type& operator=(type&&) noexcept = delete
// Tell the compiler to warn about unused return values for functions declared
// with this macro. The macro should be used on function declarations
@@ -715,9 +684,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
//
// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_;
#if defined(__GNUC__) && !defined(COMPILER_ICC)
-# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result))
+#define GTEST_MUST_USE_RESULT_ __attribute__((warn_unused_result))
#else
-# define GTEST_MUST_USE_RESULT_
+#define GTEST_MUST_USE_RESULT_
#endif // __GNUC__ && !COMPILER_ICC
// MS C++ compiler emits warning when a conditional expression is compile time
@@ -728,10 +697,9 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
// while (true) {
// GTEST_INTENTIONAL_CONST_COND_POP_()
// }
-# define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
- GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
-# define GTEST_INTENTIONAL_CONST_COND_POP_() \
- GTEST_DISABLE_MSC_WARNINGS_POP_()
+#define GTEST_INTENTIONAL_CONST_COND_PUSH_() \
+ GTEST_DISABLE_MSC_WARNINGS_PUSH_(4127)
+#define GTEST_INTENTIONAL_CONST_COND_POP_() GTEST_DISABLE_MSC_WARNINGS_POP_()
// Determine whether the compiler supports Microsoft's Structured Exception
// Handling. This is supported by several Windows compilers but generally
@@ -739,13 +707,13 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#ifndef GTEST_HAS_SEH
// The user didn't tell us, so we need to figure it out.
-# if defined(_MSC_VER) || defined(__BORLANDC__)
+#if defined(_MSC_VER) || defined(__BORLANDC__)
// These two compilers are known to support SEH.
-# define GTEST_HAS_SEH 1
-# else
+#define GTEST_HAS_SEH 1
+#else
// Assume no SEH.
-# define GTEST_HAS_SEH 0
-# endif
+#define GTEST_HAS_SEH 0
+#endif
#endif // GTEST_HAS_SEH
@@ -758,94 +726,112 @@ typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
#endif // GTEST_IS_THREADSAFE
+#if GTEST_IS_THREADSAFE
+// Some platforms don't support including these threading-related headers.
+#include <condition_variable> // NOLINT
+#include <mutex> // NOLINT
+#endif // GTEST_IS_THREADSAFE
+
// GTEST_API_ qualifies all symbols that must be exported. The definitions below
// are guarded by #ifndef to give embedders a chance to define GTEST_API_ in
// gtest/internal/custom/gtest-port.h
#ifndef GTEST_API_
#ifdef _MSC_VER
-# if GTEST_LINKED_AS_SHARED_LIBRARY
-# define GTEST_API_ __declspec(dllimport)
-# elif GTEST_CREATE_SHARED_LIBRARY
-# define GTEST_API_ __declspec(dllexport)
-# endif
+#if GTEST_LINKED_AS_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllimport)
+#elif GTEST_CREATE_SHARED_LIBRARY
+#define GTEST_API_ __declspec(dllexport)
+#endif
#elif __GNUC__ >= 4 || defined(__clang__)
-# define GTEST_API_ __attribute__((visibility ("default")))
+#define GTEST_API_ __attribute__((visibility("default")))
#endif // _MSC_VER
#endif // GTEST_API_
#ifndef GTEST_API_
-# define GTEST_API_
+#define GTEST_API_
#endif // GTEST_API_
#ifndef GTEST_DEFAULT_DEATH_TEST_STYLE
-# define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
+#define GTEST_DEFAULT_DEATH_TEST_STYLE "fast"
#endif // GTEST_DEFAULT_DEATH_TEST_STYLE
#ifdef __GNUC__
// Ask the compiler to never inline a given function.
-# define GTEST_NO_INLINE_ __attribute__((noinline))
+#define GTEST_NO_INLINE_ __attribute__((noinline))
#else
-# define GTEST_NO_INLINE_
+#define GTEST_NO_INLINE_
+#endif
+
+#if defined(__clang__)
+// Nested ifs to avoid triggering an MSVC warning.
+#if __has_attribute(disable_tail_calls)
+// Ask the compiler not to perform tail call optimization inside
+// the marked function.
+#define GTEST_NO_TAIL_CALL_ __attribute__((disable_tail_calls))
+#endif
+#elif __GNUC__
+#define GTEST_NO_TAIL_CALL_ \
+ __attribute__((optimize("no-optimize-sibling-calls")))
+#else
+#define GTEST_NO_TAIL_CALL_
#endif
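GTEST_NO_TAIL_CALL_ is new in this version; a hedged sketch of the kind of frame-sensitive helper it is meant for (function name illustrative):

#include "gtest/internal/gtest-port.h"

// Keeping the recursive frames alive makes stack traces through this
// helper readable; without the attribute the self-call below is a tail
// call the optimizer may collapse.
GTEST_NO_TAIL_CALL_ int Countdown(int n) {
  if (n <= 0) return 0;
  return Countdown(n - 1);
}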
// _LIBCPP_VERSION is defined by the libc++ library from the LLVM project.
#if !defined(GTEST_HAS_CXXABI_H_)
-# if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
-# define GTEST_HAS_CXXABI_H_ 1
-# else
-# define GTEST_HAS_CXXABI_H_ 0
-# endif
+#if defined(__GLIBCXX__) || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))
+#define GTEST_HAS_CXXABI_H_ 1
+#else
+#define GTEST_HAS_CXXABI_H_ 0
+#endif
#endif
// A function level attribute to disable checking for use of uninitialized
// memory when built with MemorySanitizer.
#if defined(__clang__)
-# if __has_feature(memory_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ \
- __attribute__((no_sanitize_memory))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-# endif // __has_feature(memory_sanitizer)
+#if __has_feature(memory_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ __attribute__((no_sanitize_memory))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#endif // __has_feature(memory_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
#endif // __clang__
// A function level attribute to disable AddressSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(address_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
- __attribute__((no_sanitize_address))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-# endif // __has_feature(address_sanitizer)
+#if __has_feature(address_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_ \
+ __attribute__((no_sanitize_address))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#endif // __has_feature(address_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
#endif // __clang__
// A function level attribute to disable HWAddressSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(hwaddress_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
- __attribute__((no_sanitize("hwaddress")))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-# endif // __has_feature(hwaddress_sanitizer)
+#if __has_feature(hwaddress_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_ \
+ __attribute__((no_sanitize("hwaddress")))
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+#endif // __has_feature(hwaddress_sanitizer)
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
#endif // __clang__
// A function level attribute to disable ThreadSanitizer instrumentation.
#if defined(__clang__)
-# if __has_feature(thread_sanitizer)
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ \
- __attribute__((no_sanitize_thread))
-# else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-# endif // __has_feature(thread_sanitizer)
+#if __has_feature(thread_sanitizer)
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ __attribute__((no_sanitize_thread))
+#else
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#endif // __has_feature(thread_sanitizer)
#else
-# define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
+#define GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
#endif // __clang__
namespace testing {
@@ -867,25 +853,37 @@ namespace internal {
// Secret object, which is what we want.
class Secret;
-// The GTEST_COMPILE_ASSERT_ is a legacy macro used to verify that a compile
-// time expression is true (in new code, use static_assert instead). For
-// example, you could use it to verify the size of a static array:
-//
-// GTEST_COMPILE_ASSERT_(GTEST_ARRAY_SIZE_(names) == NUM_NAMES,
-// names_incorrect_size);
-//
-// The second argument to the macro must be a valid C++ identifier. If the
-// expression is false, compiler will issue an error containing this identifier.
-#define GTEST_COMPILE_ASSERT_(expr, msg) static_assert(expr, #msg)
-
// A helper for suppressing warnings on constant condition. It just
// returns 'condition'.
GTEST_API_ bool IsTrue(bool condition);
// Defines RE.
-#if GTEST_USES_PCRE
-// if used, PCRE is injected by custom/gtest-port.h
+#if GTEST_USES_RE2
+
+// This is almost `using RE = ::RE2`, except it is copy-constructible, and it
+// needs to disambiguate the `std::string`, `absl::string_view`, and `const
+// char*` constructors.
+class GTEST_API_ RE {
+ public:
+ RE(absl::string_view regex) : regex_(regex) {} // NOLINT
+ RE(const char* regex) : RE(absl::string_view(regex)) {} // NOLINT
+ RE(const std::string& regex) : RE(absl::string_view(regex)) {} // NOLINT
+ RE(const RE& other) : RE(other.pattern()) {}
+
+ const std::string& pattern() const { return regex_.pattern(); }
+
+ static bool FullMatch(absl::string_view str, const RE& re) {
+ return RE2::FullMatch(str, re.regex_);
+ }
+ static bool PartialMatch(absl::string_view str, const RE& re) {
+ return RE2::PartialMatch(str, re.regex_);
+ }
+
+ private:
+ RE2 regex_;
+};
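A hedged sketch of this class's surface, valid only in builds where GTEST_USES_RE2 is selected (i.e. GTEST_HAS_ABSL with RE2 available):

#include "gtest/internal/gtest-port.h"

using ::testing::internal::RE;

void ReSketch() {
  const RE re("a.*z");  // also constructible from std::string or
                        // absl::string_view, and copyable via pattern()
  bool full = RE::FullMatch("abcz", re);       // true: entire string matches
  bool part = RE::PartialMatch("xxazyy", re);  // true: some substring matches
  (void)full;
  (void)part;
}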
+
#elif GTEST_USES_POSIX_RE || GTEST_USES_SIMPLE_RE
// A simple C++ wrapper for <regex.h>. It uses the POSIX Extended
@@ -924,19 +922,19 @@ class GTEST_API_ RE {
const char* pattern_;
bool is_valid_;
-# if GTEST_USES_POSIX_RE
+#if GTEST_USES_POSIX_RE
regex_t full_regex_; // For FullMatch().
regex_t partial_regex_; // For PartialMatch().
-# else // GTEST_USES_SIMPLE_RE
+#else // GTEST_USES_SIMPLE_RE
const char* full_pattern_; // For FullMatch().
-# endif
+#endif
};
-#endif // GTEST_USES_PCRE
+#endif // ::testing::internal::RE implementation
// Formats a source file path and a line number as they would appear
// in an error message from the compiler used to compile this code.
@@ -954,12 +952,7 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
// LogToStderr() - directs all log messages to stderr.
// FlushInfoLog() - flushes informational log messages.
-enum GTestLogSeverity {
- GTEST_INFO,
- GTEST_WARNING,
- GTEST_ERROR,
- GTEST_FATAL
-};
+enum GTestLogSeverity { GTEST_INFO, GTEST_WARNING, GTEST_ERROR, GTEST_FATAL };
// Formats log entry severity, provides a stream object for streaming the
// log message, and terminates the message with a newline when going out of
@@ -976,14 +969,16 @@ class GTEST_API_ GTestLog {
private:
const GTestLogSeverity severity_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog);
+ GTestLog(const GTestLog&) = delete;
+ GTestLog& operator=(const GTestLog&) = delete;
};
#if !defined(GTEST_LOG_)
-# define GTEST_LOG_(severity) \
- ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
- __FILE__, __LINE__).GetStream()
+#define GTEST_LOG_(severity) \
+ ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \
+ __FILE__, __LINE__) \
+ .GetStream()
inline void LogToStderr() {}
inline void FlushInfoLog() { fflush(nullptr); }
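A hedged usage sketch: the macro builds a temporary GTestLog whose destructor terminates the message (and, for FATAL, aborts):

#include "gtest/internal/gtest-port.h"

void LogSketch(int fd) {
  GTEST_LOG_(WARNING) << "fd " << fd << " looks stale";  // newline appended
                                                         // on destruction
  if (fd < 0) GTEST_LOG_(FATAL) << "invalid fd " << fd;  // logs, then aborts
}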
@@ -995,7 +990,7 @@ inline void FlushInfoLog() { fflush(nullptr); }
//
// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition
// is not satisfied.
-// Synopsys:
+// Synopsis:
// GTEST_CHECK_(boolean_condition);
// or
// GTEST_CHECK_(boolean_condition) << "Additional message";
@@ -1005,12 +1000,12 @@ inline void FlushInfoLog() { fflush(nullptr); }
// condition itself, plus additional message streamed into it, if any,
// and then it aborts the program. It aborts the program irrespective of
// whether it is built in the debug mode or not.
-# define GTEST_CHECK_(condition) \
- GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
- if (::testing::internal::IsTrue(condition)) \
- ; \
- else \
- GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
+#define GTEST_CHECK_(condition) \
+ GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
+ if (::testing::internal::IsTrue(condition)) \
+ ; \
+ else \
+ GTEST_LOG_(FATAL) << "Condition " #condition " failed. "
#endif // !defined(GTEST_CHECK_)
// An all-mode assert to verify that the given POSIX-style function
@@ -1019,9 +1014,8 @@ inline void FlushInfoLog() { fflush(nullptr); }
// in {} if you need to use it as the only statement in an 'if'
// branch.
#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \
- if (const int gtest_error = (posix_call)) \
- GTEST_LOG_(FATAL) << #posix_call << "failed with error " \
- << gtest_error
+ if (const int gtest_error = (posix_call)) \
+    GTEST_LOG_(FATAL) << #posix_call << " failed with error " << gtest_error
// Transforms "T" into "const T&" according to standard reference collapsing
// rules (this is only needed as a backport for C++98 compilers that do not
@@ -1035,9 +1029,13 @@ inline void FlushInfoLog() { fflush(nullptr); }
// Note that the non-const reference will not have "const" added. This is
// standard, and necessary so that "T" can always bind to "const T&".
template <typename T>
-struct ConstRef { typedef const T& type; };
+struct ConstRef {
+ typedef const T& type;
+};
template <typename T>
-struct ConstRef<T&> { typedef T& type; };
+struct ConstRef<T&> {
+ typedef T& type;
+};
// The argument T must depend on some template parameters.
#define GTEST_REFERENCE_TO_CONST_(T) \
@@ -1050,7 +1048,7 @@ struct ConstRef<T&> { typedef T& type; };
// const Foo*). When you use ImplicitCast_, the compiler checks that
// the cast is safe. Such explicit ImplicitCast_s are necessary in
// surprisingly many situations where C++ demands an exact type match
-// instead of an argument type convertable to a target type.
+// instead of an argument type convertible to a target type.
//
// The syntax for using ImplicitCast_ is the same as for static_cast:
//
@@ -1063,8 +1061,10 @@ struct ConstRef<T&> { typedef T& type; };
// This relatively ugly name is intentional. It prevents clashes with
// similar functions users may have (e.g., implicit_cast). The internal
// namespace alone is not enough because the function can be found by ADL.
-template<typename To>
-inline To ImplicitCast_(To x) { return x; }
+template <typename To>
+inline To ImplicitCast_(To x) {
+ return x;
+}
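A hedged sketch contrasting ImplicitCast_ with the DownCast_ helper discussed next (types illustrative):

#include "gtest/internal/gtest-port.h"

struct Base { virtual ~Base() = default; };
struct Derived : Base {};

void CastSketch(Derived* d) {
  // Upcast: compile-time checked, no runtime cost.
  Base* b = ::testing::internal::ImplicitCast_<Base*>(d);
  // Downcast: DownCast_ verifies the dynamic type via dynamic_cast when
  // RTTI is on, and degrades to a plain static_cast otherwise.
  Derived* back = ::testing::internal::DownCast_<Derived*>(b);
  (void)back;
}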
// When you upcast (that is, cast a pointer from type Foo to type
// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts
@@ -1087,17 +1087,17 @@ inline To ImplicitCast_(To x) { return x; }
// This relatively ugly name is intentional. It prevents clashes with
// similar functions users may have (e.g., down_cast). The internal
// namespace alone is not enough because the function can be found by ADL.
-template<typename To, typename From> // use like this: DownCast_<T*>(foo);
-inline To DownCast_(From* f) { // so we only accept pointers
+template <typename To, typename From> // use like this: DownCast_<T*>(foo);
+inline To DownCast_(From* f) { // so we only accept pointers
// Ensures that To is a sub-type of From *. This test is here only
// for compile-time type checking, and has no overhead in an
// optimized build at run-time, as it will be optimized away
// completely.
GTEST_INTENTIONAL_CONST_COND_PUSH_()
if (false) {
- GTEST_INTENTIONAL_CONST_COND_POP_()
- const To to = nullptr;
- ::testing::internal::ImplicitCast_<From*>(to);
+ GTEST_INTENTIONAL_CONST_COND_POP_()
+ const To to = nullptr;
+ ::testing::internal::ImplicitCast_<From*>(to);
}
#if GTEST_HAS_RTTI
@@ -1162,71 +1162,8 @@ void ClearInjectableArgvs();
// Defines synchronization primitives.
#if GTEST_IS_THREADSAFE
-# if GTEST_HAS_PTHREAD
-// Sleeps for (roughly) n milliseconds. This function is only for testing
-// Google Test's own constructs. Don't use it in user tests, either
-// directly or indirectly.
-inline void SleepMilliseconds(int n) {
- const timespec time = {
- 0, // 0 seconds.
- n * 1000L * 1000L, // And n ms.
- };
- nanosleep(&time, nullptr);
-}
-# endif // GTEST_HAS_PTHREAD
-
-# if GTEST_HAS_NOTIFICATION_
-// Notification has already been imported into the namespace.
-// Nothing to do here.
-
-# elif GTEST_HAS_PTHREAD
-// Allows a controller thread to pause execution of newly created
-// threads until notified. Instances of this class must be created
-// and destroyed in the controller thread.
-//
-// This class is only for testing Google Test's own constructs. Do not
-// use it in user tests, either directly or indirectly.
-class Notification {
- public:
- Notification() : notified_(false) {
- GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
- }
- ~Notification() {
- pthread_mutex_destroy(&mutex_);
- }
-
- // Notifies all threads created with this notification to start. Must
- // be called from the controller thread.
- void Notify() {
- pthread_mutex_lock(&mutex_);
- notified_ = true;
- pthread_mutex_unlock(&mutex_);
- }
-
- // Blocks until the controller thread notifies. Must be called from a test
- // thread.
- void WaitForNotification() {
- for (;;) {
- pthread_mutex_lock(&mutex_);
- const bool notified = notified_;
- pthread_mutex_unlock(&mutex_);
- if (notified)
- break;
- SleepMilliseconds(10);
- }
- }
-
- private:
- pthread_mutex_t mutex_;
- bool notified_;
-
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
-};
-
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
-
-GTEST_API_ void SleepMilliseconds(int n);
+#if GTEST_OS_WINDOWS
// Provides leak-safe Windows kernel handle ownership.
// Used in death tests and in threading support.
class GTEST_API_ AutoHandle {
@@ -1253,8 +1190,18 @@ class GTEST_API_ AutoHandle {
Handle handle_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+ AutoHandle(const AutoHandle&) = delete;
+ AutoHandle& operator=(const AutoHandle&) = delete;
};
+#endif
+
+#if GTEST_HAS_NOTIFICATION_
+// Notification has already been imported into the namespace.
+// Nothing to do here.
+
+#else
+GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
+/* class A needs to have dll-interface to be used by clients of class B */)
// Allows a controller thread to pause execution of newly created
// threads until notified. Instances of this class must be created
@@ -1262,23 +1209,40 @@ class GTEST_API_ AutoHandle {
//
// This class is only for testing Google Test's own constructs. Do not
// use it in user tests, either directly or indirectly.
+// TODO(b/203539622): Replace unconditionally with absl::Notification.
class GTEST_API_ Notification {
public:
- Notification();
- void Notify();
- void WaitForNotification();
+ Notification() : notified_(false) {}
+ Notification(const Notification&) = delete;
+ Notification& operator=(const Notification&) = delete;
- private:
- AutoHandle event_;
+ // Notifies all threads created with this notification to start. Must
+ // be called from the controller thread.
+ void Notify() {
+ std::lock_guard<std::mutex> lock(mu_);
+ notified_ = true;
+ cv_.notify_all();
+ }
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification);
+ // Blocks until the controller thread notifies. Must be called from a test
+ // thread.
+ void WaitForNotification() {
+ std::unique_lock<std::mutex> lock(mu_);
+ cv_.wait(lock, [this]() { return notified_; });
+ }
+
+ private:
+ std::mutex mu_;
+ std::condition_variable cv_;
+ bool notified_;
};
-# endif // GTEST_HAS_NOTIFICATION_
+GTEST_DISABLE_MSC_WARNINGS_POP_() // 4251
+#endif // GTEST_HAS_NOTIFICATION_
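The rewritten Notification drops the pthread and AutoHandle variants in favor of a single std::mutex/std::condition_variable implementation. A minimal controller/worker sketch of the intended usage:

#include <thread>

#include "gtest/internal/gtest-port.h"

void Demo() {
  ::testing::internal::Notification start;
  std::thread worker([&start] {
    start.WaitForNotification();  // Blocks until the controller notifies.
    // ... worker body runs here ...
  });
  start.Notify();  // Wakes every thread waiting on `start`.
  worker.join();
}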
// On MinGW, we can have both GTEST_OS_WINDOWS and GTEST_HAS_PTHREAD
// defined, but we don't want to use MinGW's pthreads implementation, which
// has conformance problems with some versions of the POSIX standard.
-# if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_HAS_PTHREAD && !GTEST_OS_WINDOWS_MINGW
// As a C-function, ThreadFuncWithCLinkage cannot be templated itself.
// Consequently, it cannot select a correct instantiation of ThreadWithParam
@@ -1354,16 +1318,17 @@ class ThreadWithParam : public ThreadWithParamBase {
// finished.
pthread_t thread_; // The native thread object.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+ ThreadWithParam(const ThreadWithParam&) = delete;
+ ThreadWithParam& operator=(const ThreadWithParam&) = delete;
};
-# endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
- // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif // !GTEST_OS_WINDOWS && GTEST_HAS_PTHREAD ||
+ // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
-# if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#if GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
// Mutex and ThreadLocal have already been imported into the namespace.
// Nothing to do here.
-# elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#elif GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// Mutex implements mutex on Windows platforms. It is used in conjunction
// with class MutexLock:
@@ -1417,14 +1382,15 @@ class GTEST_API_ Mutex {
long critical_section_init_phase_; // NOLINT
GTEST_CRITICAL_SECTION* critical_section_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+ Mutex(const Mutex&) = delete;
+ Mutex& operator=(const Mutex&) = delete;
};
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
- extern ::testing::internal::Mutex mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::Mutex mutex
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
- ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) \
+ ::testing::internal::Mutex mutex(::testing::internal::Mutex::kStaticMutex)
// We cannot name this class MutexLock because the ctor declaration would
// conflict with a macro named MutexLock, which is defined on some
@@ -1433,15 +1399,15 @@ class GTEST_API_ Mutex {
// "MutexLock l(&mu)". Hence the typedef trick below.
class GTestMutexLock {
public:
- explicit GTestMutexLock(Mutex* mutex)
- : mutex_(mutex) { mutex_->Lock(); }
+ explicit GTestMutexLock(Mutex* mutex) : mutex_(mutex) { mutex_->Lock(); }
~GTestMutexLock() { mutex_->Unlock(); }
private:
Mutex* const mutex_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+ GTestMutexLock(const GTestMutexLock&) = delete;
+ GTestMutexLock& operator=(const GTestMutexLock&) = delete;
};
typedef GTestMutexLock MutexLock;
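GTestMutexLock is a plain RAII guard: the constructor locks, the destructor unlocks, and the typedef lets call sites spell it MutexLock despite the macro clash described above. A sketch of the pattern, assuming a default-constructible Mutex:

#include "gtest/internal/gtest-port.h"

static ::testing::internal::Mutex g_mu;

void Touch() {
  ::testing::internal::MutexLock lock(&g_mu);  // Acquires g_mu here.
  // ... critical section ...
}  // Releases g_mu when `lock` goes out of scope.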
@@ -1468,7 +1434,8 @@ class ThreadLocalBase {
virtual ~ThreadLocalBase() {}
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocalBase);
+ ThreadLocalBase(const ThreadLocalBase&) = delete;
+ ThreadLocalBase& operator=(const ThreadLocalBase&) = delete;
};
// Maps a thread to a set of ThreadLocals that have values instantiated on that
@@ -1497,7 +1464,7 @@ class GTEST_API_ ThreadWithParamBase {
virtual void Run() = 0;
};
- ThreadWithParamBase(Runnable *runnable, Notification* thread_can_start);
+ ThreadWithParamBase(Runnable* runnable, Notification* thread_can_start);
virtual ~ThreadWithParamBase();
private:
@@ -1511,30 +1478,26 @@ class ThreadWithParam : public ThreadWithParamBase {
typedef void UserThreadFunc(T);
ThreadWithParam(UserThreadFunc* func, T param, Notification* thread_can_start)
- : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {
- }
+ : ThreadWithParamBase(new RunnableImpl(func, param), thread_can_start) {}
virtual ~ThreadWithParam() {}
private:
class RunnableImpl : public Runnable {
public:
- RunnableImpl(UserThreadFunc* func, T param)
- : func_(func),
- param_(param) {
- }
+ RunnableImpl(UserThreadFunc* func, T param) : func_(func), param_(param) {}
virtual ~RunnableImpl() {}
- virtual void Run() {
- func_(param_);
- }
+ virtual void Run() { func_(param_); }
private:
UserThreadFunc* const func_;
const T param_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(RunnableImpl);
+ RunnableImpl(const RunnableImpl&) = delete;
+ RunnableImpl& operator=(const RunnableImpl&) = delete;
};
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam);
+ ThreadWithParam(const ThreadWithParam&) = delete;
+ ThreadWithParam& operator=(const ThreadWithParam&) = delete;
};
// Implements thread-local storage on Windows systems.
@@ -1571,7 +1534,7 @@ class ThreadLocal : public ThreadLocalBase {
explicit ThreadLocal(const T& value)
: default_factory_(new InstanceValueHolderFactory(value)) {}
- ~ThreadLocal() { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
+ ~ThreadLocal() override { ThreadLocalRegistry::OnThreadLocalDestroyed(this); }
T* pointer() { return GetOrCreateValue(); }
const T* pointer() const { return GetOrCreateValue(); }
@@ -1590,16 +1553,17 @@ class ThreadLocal : public ThreadLocalBase {
private:
T value_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+ ValueHolder(const ValueHolder&) = delete;
+ ValueHolder& operator=(const ValueHolder&) = delete;
};
-
T* GetOrCreateValue() const {
return static_cast<ValueHolder*>(
- ThreadLocalRegistry::GetValueOnCurrentThread(this))->pointer();
+ ThreadLocalRegistry::GetValueOnCurrentThread(this))
+ ->pointer();
}
- virtual ThreadLocalValueHolderBase* NewValueForCurrentThread() const {
+ ThreadLocalValueHolderBase* NewValueForCurrentThread() const override {
return default_factory_->MakeNewHolder();
}
@@ -1610,7 +1574,8 @@ class ThreadLocal : public ThreadLocalBase {
virtual ValueHolder* MakeNewHolder() const = 0;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+ ValueHolderFactory(const ValueHolderFactory&) = delete;
+ ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
};
class DefaultValueHolderFactory : public ValueHolderFactory {
@@ -1619,7 +1584,9 @@ class ThreadLocal : public ThreadLocalBase {
ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+ DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+ DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+ delete;
};
class InstanceValueHolderFactory : public ValueHolderFactory {
@@ -1632,15 +1599,18 @@ class ThreadLocal : public ThreadLocalBase {
private:
const T value_; // The value for each thread.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+ InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+ InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+ delete;
};
std::unique_ptr<ValueHolderFactory> default_factory_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+ ThreadLocal(const ThreadLocal&) = delete;
+ ThreadLocal& operator=(const ThreadLocal&) = delete;
};
-# elif GTEST_HAS_PTHREAD
+#elif GTEST_HAS_PTHREAD
// MutexBase and Mutex implement mutex on pthreads-based platforms.
class MutexBase {
@@ -1687,8 +1657,8 @@ class MutexBase {
};
// Forward-declares a static mutex.
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
- extern ::testing::internal::MutexBase mutex
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+ extern ::testing::internal::MutexBase mutex
// Defines and statically (i.e. at link time) initializes a static mutex.
// The initialization list here does not explicitly initialize each field,
@@ -1707,12 +1677,11 @@ class Mutex : public MutexBase {
GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, nullptr));
has_owner_ = false;
}
- ~Mutex() {
- GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_));
- }
+ ~Mutex() { GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex);
+ Mutex(const Mutex&) = delete;
+ Mutex& operator=(const Mutex&) = delete;
};
// We cannot name this class MutexLock because the ctor declaration would
@@ -1722,15 +1691,15 @@ class Mutex : public MutexBase {
// "MutexLock l(&mu)". Hence the typedef trick below.
class GTestMutexLock {
public:
- explicit GTestMutexLock(MutexBase* mutex)
- : mutex_(mutex) { mutex_->Lock(); }
+ explicit GTestMutexLock(MutexBase* mutex) : mutex_(mutex) { mutex_->Lock(); }
~GTestMutexLock() { mutex_->Unlock(); }
private:
MutexBase* const mutex_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock);
+ GTestMutexLock(const GTestMutexLock&) = delete;
+ GTestMutexLock& operator=(const GTestMutexLock&) = delete;
};
typedef GTestMutexLock MutexLock;
@@ -1787,7 +1756,8 @@ class GTEST_API_ ThreadLocal {
private:
T value_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder);
+ ValueHolder(const ValueHolder&) = delete;
+ ValueHolder& operator=(const ValueHolder&) = delete;
};
static pthread_key_t CreateKey() {
@@ -1819,7 +1789,8 @@ class GTEST_API_ ThreadLocal {
virtual ValueHolder* MakeNewHolder() const = 0;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolderFactory);
+ ValueHolderFactory(const ValueHolderFactory&) = delete;
+ ValueHolderFactory& operator=(const ValueHolderFactory&) = delete;
};
class DefaultValueHolderFactory : public ValueHolderFactory {
@@ -1828,7 +1799,9 @@ class GTEST_API_ ThreadLocal {
ValueHolder* MakeNewHolder() const override { return new ValueHolder(); }
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultValueHolderFactory);
+ DefaultValueHolderFactory(const DefaultValueHolderFactory&) = delete;
+ DefaultValueHolderFactory& operator=(const DefaultValueHolderFactory&) =
+ delete;
};
class InstanceValueHolderFactory : public ValueHolderFactory {
@@ -1841,17 +1814,20 @@ class GTEST_API_ ThreadLocal {
private:
const T value_; // The value for each thread.
- GTEST_DISALLOW_COPY_AND_ASSIGN_(InstanceValueHolderFactory);
+ InstanceValueHolderFactory(const InstanceValueHolderFactory&) = delete;
+ InstanceValueHolderFactory& operator=(const InstanceValueHolderFactory&) =
+ delete;
};
// A key pthreads uses for looking up per-thread values.
const pthread_key_t key_;
std::unique_ptr<ValueHolderFactory> default_factory_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal);
+ ThreadLocal(const ThreadLocal&) = delete;
+ ThreadLocal& operator=(const ThreadLocal&) = delete;
};
-# endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
+#endif // GTEST_HAS_MUTEX_AND_THREAD_LOCAL_
#else // GTEST_IS_THREADSAFE
@@ -1868,10 +1844,10 @@ class Mutex {
void AssertHeld() const {}
};
-# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
+#define GTEST_DECLARE_STATIC_MUTEX_(mutex) \
extern ::testing::internal::Mutex mutex
-# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
+#define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex
// We cannot name this class MutexLock because the ctor declaration would
// conflict with a macro named MutexLock, which is defined on some
@@ -1894,6 +1870,7 @@ class GTEST_API_ ThreadLocal {
const T* pointer() const { return &value_; }
const T& get() const { return value_; }
void set(const T& value) { value_ = value; }
+
private:
T value_;
};
@@ -1905,11 +1882,11 @@ class GTEST_API_ ThreadLocal {
GTEST_API_ size_t GetThreadCount();
#if GTEST_OS_WINDOWS
-# define GTEST_PATH_SEP_ "\\"
-# define GTEST_HAS_ALT_PATH_SEP_ 1
+#define GTEST_PATH_SEP_ "\\"
+#define GTEST_HAS_ALT_PATH_SEP_ 1
#else
-# define GTEST_PATH_SEP_ "/"
-# define GTEST_HAS_ALT_PATH_SEP_ 0
+#define GTEST_PATH_SEP_ "/"
+#define GTEST_HAS_ALT_PATH_SEP_ 0
#endif // GTEST_OS_WINDOWS
// Utilities for char.
@@ -1967,8 +1944,7 @@ inline char ToUpper(char ch) {
inline std::string StripTrailingSpaces(std::string str) {
std::string::iterator it = str.end();
- while (it != str.begin() && IsSpace(*--it))
- it = str.erase(it);
+ while (it != str.begin() && IsSpace(*--it)) it = str.erase(it);
return str;
}
@@ -1986,36 +1962,35 @@ namespace posix {
typedef struct _stat StatStruct;
-# ifdef __BORLANDC__
+#ifdef __BORLANDC__
inline int DoIsATTY(int fd) { return isatty(fd); }
inline int StrCaseCmp(const char* s1, const char* s2) {
return stricmp(s1, s2);
}
inline char* StrDup(const char* src) { return strdup(src); }
-# else // !__BORLANDC__
-# if GTEST_OS_WINDOWS_MOBILE
+#else // !__BORLANDC__
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
+ GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
inline int DoIsATTY(int /* fd */) { return 0; }
-# else
+#else
inline int DoIsATTY(int fd) { return _isatty(fd); }
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
inline int StrCaseCmp(const char* s1, const char* s2) {
return _stricmp(s1, s2);
}
inline char* StrDup(const char* src) { return _strdup(src); }
-# endif // __BORLANDC__
+#endif // __BORLANDC__
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
inline int FileNo(FILE* file) { return reinterpret_cast<int>(_fileno(file)); }
// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this
// time and thus not defined there.
-# else
+#else
inline int FileNo(FILE* file) { return _fileno(file); }
inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); }
inline int RmDir(const char* dir) { return _rmdir(dir); }
-inline bool IsDir(const StatStruct& st) {
- return (_S_IFDIR & st.st_mode) != 0;
-}
-# endif // GTEST_OS_WINDOWS_MOBILE
+inline bool IsDir(const StatStruct& st) { return (_S_IFDIR & st.st_mode) != 0; }
+#endif // GTEST_OS_WINDOWS_MOBILE
#elif GTEST_OS_ESP8266
typedef struct stat StatStruct;
@@ -2079,12 +2054,12 @@ inline FILE* FOpen(const char* path, const char* mode) {
std::wstring wide_path = converter.from_bytes(path);
std::wstring wide_mode = converter.from_bytes(mode);
return _wfopen(wide_path.c_str(), wide_mode.c_str());
-#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+#else // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
return fopen(path, mode);
#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
}
#if !GTEST_OS_WINDOWS_MOBILE
-inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
+inline FILE* FReopen(const char* path, const char* mode, FILE* stream) {
return freopen(path, mode, stream);
}
inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); }
@@ -2136,13 +2111,13 @@ GTEST_DISABLE_MSC_DEPRECATED_POP_()
// snprintf is a variadic function.
#if _MSC_VER && !GTEST_OS_WINDOWS_MOBILE
// MSVC 2005 and above support variadic macros.
-# define GTEST_SNPRINTF_(buffer, size, format, ...) \
- _snprintf_s(buffer, size, size, format, __VA_ARGS__)
+#define GTEST_SNPRINTF_(buffer, size, format, ...) \
+ _snprintf_s(buffer, size, size, format, __VA_ARGS__)
#elif defined(_MSC_VER)
// Windows CE does not define _snprintf_s
-# define GTEST_SNPRINTF_ _snprintf
+#define GTEST_SNPRINTF_ _snprintf
#else
-# define GTEST_SNPRINTF_ snprintf
+#define GTEST_SNPRINTF_ snprintf
#endif
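GTEST_SNPRINTF_ hides the platforms' differing snprintf spellings behind one name; callers always pass the destination size once. A sketch of the call shape:

char buf[32];
// MSVC (non-CE): _snprintf_s(buf, sizeof(buf), sizeof(buf), "%d-%d", 1, 2)
// Windows CE:    _snprintf(buf, sizeof(buf), "%d-%d", 1, 2)
// elsewhere:     snprintf(buf, sizeof(buf), "%d-%d", 1, 2)
GTEST_SNPRINTF_(buf, sizeof(buf), "%d-%d", 1, 2);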
// The biggest signed integer type the compiler supports.
@@ -2202,37 +2177,84 @@ using TimeInMillis = int64_t; // Represents time in milliseconds.
// Macro for referencing flags.
#if !defined(GTEST_FLAG)
-# define GTEST_FLAG(name) FLAGS_gtest_##name
+#define GTEST_FLAG_NAME_(name) gtest_##name
+#define GTEST_FLAG(name) FLAGS_gtest_##name
#endif // !defined(GTEST_FLAG)
-#if !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
-# define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
-#endif // !defined(GTEST_USE_OWN_FLAGFILE_FLAG_)
+// Pick a command line flags implementation.
+#if GTEST_HAS_ABSL
-#if !defined(GTEST_DECLARE_bool_)
-# define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+// Macros for defining flags.
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+ ABSL_FLAG(bool, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+ ABSL_FLAG(int32_t, GTEST_FLAG_NAME_(name), default_val, doc)
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+ ABSL_FLAG(std::string, GTEST_FLAG_NAME_(name), default_val, doc)
// Macros for declaring flags.
-# define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name)
-# define GTEST_DECLARE_int32_(name) \
- GTEST_API_ extern std::int32_t GTEST_FLAG(name)
-# define GTEST_DECLARE_string_(name) \
- GTEST_API_ extern ::std::string GTEST_FLAG(name)
+#define GTEST_DECLARE_bool_(name) \
+ ABSL_DECLARE_FLAG(bool, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_int32_(name) \
+ ABSL_DECLARE_FLAG(int32_t, GTEST_FLAG_NAME_(name))
+#define GTEST_DECLARE_string_(name) \
+ ABSL_DECLARE_FLAG(std::string, GTEST_FLAG_NAME_(name))
+
+#define GTEST_FLAG_SAVER_ ::absl::FlagSaver
+
+#define GTEST_FLAG_GET(name) ::absl::GetFlag(GTEST_FLAG(name))
+#define GTEST_FLAG_SET(name, value) \
+ (void)(::absl::SetFlag(&GTEST_FLAG(name), value))
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 0
+
+#else // GTEST_HAS_ABSL
// Macros for defining flags.
-# define GTEST_DEFINE_bool_(name, default_val, doc) \
- GTEST_API_ bool GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_int32_(name, default_val, doc) \
- GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val)
-# define GTEST_DEFINE_string_(name, default_val, doc) \
- GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val)
+#define GTEST_DEFINE_bool_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ bool GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_int32_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ std::int32_t GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DEFINE_string_(name, default_val, doc) \
+ namespace testing { \
+ GTEST_API_ ::std::string GTEST_FLAG(name) = (default_val); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
-#endif // !defined(GTEST_DECLARE_bool_)
+// Macros for declaring flags.
+#define GTEST_DECLARE_bool_(name) \
+ namespace testing { \
+ GTEST_API_ extern bool GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_int32_(name) \
+ namespace testing { \
+ GTEST_API_ extern std::int32_t GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+#define GTEST_DECLARE_string_(name) \
+ namespace testing { \
+ GTEST_API_ extern ::std::string GTEST_FLAG(name); \
+ } \
+ static_assert(true, "no-op to require trailing semicolon")
+
+#define GTEST_FLAG_SAVER_ ::testing::internal::GTestFlagSaver
+
+#define GTEST_FLAG_GET(name) ::testing::GTEST_FLAG(name)
+#define GTEST_FLAG_SET(name, value) (void)(::testing::GTEST_FLAG(name) = value)
+#define GTEST_USE_OWN_FLAGFILE_FLAG_ 1
+
+#endif // GTEST_HAS_ABSL
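The net effect of the two branches is that all flag access now goes through GTEST_FLAG_GET and GTEST_FLAG_SET instead of touching FLAGS_gtest_* directly, which keeps the Abseil-backed and built-in implementations interchangeable. For example:

// Reads gtest_filter through whichever backend is compiled in.
std::string filter = GTEST_FLAG_GET(filter);
// Writes it back the same way; the (void) in the expansion keeps the
// macro usable as a plain statement.
GTEST_FLAG_SET(filter, "MySuite.*");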
// Thread annotations
#if !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
-# define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
-# define GTEST_LOCK_EXCLUDED_(locks)
+#define GTEST_EXCLUSIVE_LOCK_REQUIRED_(locks)
+#define GTEST_LOCK_EXCLUDED_(locks)
#endif // !defined(GTEST_EXCLUSIVE_LOCK_REQUIRED_)
// Parses 'str' for a 32-bit signed integer. If successful, writes the result
@@ -2308,6 +2330,7 @@ namespace testing {
namespace internal {
template <typename T>
using Optional = ::absl::optional<T>;
+inline ::absl::nullopt_t Nullopt() { return ::absl::nullopt; }
} // namespace internal
} // namespace testing
#else
@@ -2321,6 +2344,7 @@ namespace testing {
namespace internal {
template <typename T>
using Optional = ::std::optional<T>;
+inline ::std::nullopt_t Nullopt() { return ::std::nullopt; }
} // namespace internal
} // namespace testing
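Both branches of this change add the same Nullopt() helper next to the Optional alias, so internal code can spell the empty state without knowing which optional backend is active. A hedged sketch:

::testing::internal::Optional<int> MaybeParse(bool ok) {
  if (!ok) return ::testing::internal::Nullopt();  // Empty under either backend.
  return 42;
}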
// The case where absl is configured NOT to alias std::optional is not
@@ -2332,7 +2356,7 @@ using Optional = ::std::optional<T>;
#if GTEST_HAS_ABSL
// Always use absl::string_view for Matcher<> specializations if googletest
// is built with absl support.
-# define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
#include "absl/strings/string_view.h"
namespace testing {
namespace internal {
@@ -2340,11 +2364,11 @@ using StringView = ::absl::string_view;
} // namespace internal
} // namespace testing
#else
-# ifdef __has_include
-# if __has_include(<string_view>) && __cplusplus >= 201703L
+#ifdef __has_include
+#if __has_include(<string_view>) && __cplusplus >= 201703L
// Otherwise for C++17 and higher use std::string_view for Matcher<>
// specializations.
-# define GTEST_INTERNAL_HAS_STRING_VIEW 1
+#define GTEST_INTERNAL_HAS_STRING_VIEW 1
#include <string_view>
namespace testing {
namespace internal {
@@ -2353,8 +2377,8 @@ using StringView = ::std::string_view;
} // namespace testing
// The case where absl is configured NOT to alias std::string_view is not
// supported.
-# endif // __has_include(<string_view>) && __cplusplus >= 201703L
-# endif // __has_include
+#endif // __has_include(<string_view>) && __cplusplus >= 201703L
+#endif // __has_include
#endif // GTEST_HAS_ABSL
#if GTEST_HAS_ABSL
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
index 10f774f96..cca2e1f2a 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
@@ -26,7 +26,7 @@
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//
+
// The Google C++ Testing and Mocking Framework (Google Test)
//
// This header file declares the String class and functions used internally by
@@ -36,17 +36,20 @@
// This header file is #included by gtest-internal.h.
// It should not be #included by other files.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
#ifdef __BORLANDC__
// string.h is not guaranteed to provide strcpy on C++ Builder.
-# include <mem.h>
+#include <mem.h>
#endif
#include <string.h>
+
#include <cstdint>
#include <string>
@@ -123,8 +126,7 @@ class GTEST_API_ String {
// Unlike strcasecmp(), this function can handle NULL argument(s).
// A NULL C string is considered different to any non-NULL C string,
// including the empty string.
- static bool CaseInsensitiveCStringEquals(const char* lhs,
- const char* rhs);
+ static bool CaseInsensitiveCStringEquals(const char* lhs, const char* rhs);
// Compares two wide C strings, ignoring case. Returns true if and only if
// they have the same content.
@@ -143,8 +145,8 @@ class GTEST_API_ String {
// Returns true if and only if the given string ends with the given suffix,
// ignoring case. Any string is considered to end with an empty suffix.
- static bool EndsWithCaseInsensitive(
- const std::string& str, const std::string& suffix);
+ static bool EndsWithCaseInsensitive(const std::string& str,
+ const std::string& suffix);
// Formats an int value as "%02d".
static std::string FormatIntWidth2(int value); // "%02d" for width == 2
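These String helpers are declaration-only here; their documented contracts are easy to pin down in a test. A hedged illustration (the expected values follow the comments above, not a real implementation):

#include "gtest/gtest.h"

TEST(StringHelpers, DocumentedContract) {
  using ::testing::internal::String;
  // NULL differs from every non-NULL string, including "".
  EXPECT_FALSE(String::CaseInsensitiveCStringEquals(nullptr, ""));
  EXPECT_TRUE(String::CaseInsensitiveCStringEquals("Hello", "HELLO"));
  EXPECT_EQ("07", String::FormatIntWidth2(7));  // "%02d" formatting.
}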
@@ -163,7 +165,7 @@ class GTEST_API_ String {
private:
String(); // Not meant to be instantiated.
-}; // class String
+}; // class String
// Gets the content of the stringstream's buffer as an std::string. Each '\0'
// character in the buffer is replaced with "\\0".
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
index b87a2e2ca..6bc02a7de 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
@@ -30,7 +30,9 @@
// Type utilities needed for implementing typed and type-parameterized
// tests.
-// GOOGLETEST_CM0001 DO NOT DELETE
+// IWYU pragma: private, include "gtest/gtest.h"
+// IWYU pragma: friend gtest/.*
+// IWYU pragma: friend gmock/.*
#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
@@ -39,11 +41,11 @@
// #ifdef __GNUC__ is too general here. It is possible to use gcc without using
// libstdc++ (which is where cxxabi.h comes from).
-# if GTEST_HAS_CXXABI_H_
-# include <cxxabi.h>
-# elif defined(__HP_aCC)
-# include <acxx_demangle.h>
-# endif // GTEST_HASH_CXXABI_H_
(Note: the trailing comment misspells GTEST_HAS_CXXABI_H_ in both the old and new lines; the corrected spelling is used below.)
+#if GTEST_HAS_CXXABI_H_
+#include <cxxabi.h>
+#elif defined(__HP_aCC)
+#include <acxx_demangle.h>
+#endif // GTEST_HAS_CXXABI_H_
namespace testing {
namespace internal {
@@ -101,7 +103,9 @@ std::string GetTypeName() {
// A unique type indicating an empty node
struct None {};
-# define GTEST_TEMPLATE_ template <typename T> class
+#define GTEST_TEMPLATE_ \
+ template <typename T> \
+ class
// The template "selector" struct TemplateSel<Tmpl> is used to
// represent Tmpl, which must be a class template with one type
@@ -119,8 +123,7 @@ struct TemplateSel {
};
};
-# define GTEST_BIND_(TmplSel, T) \
- TmplSel::template Bind<T>::type
+#define GTEST_BIND_(TmplSel, T) TmplSel::template Bind<T>::type
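TemplateSel and GTEST_BIND_ turn a single-parameter class template into a value that can be re-applied to a type later, which is what the typed-test machinery needs. A minimal sketch with a hypothetical template:

#include <type_traits>

#include "gtest/internal/gtest-type-util.h"

template <typename T>
class Box {};

using BoxSel = ::testing::internal::TemplateSel<Box>;
// GTEST_BIND_(BoxSel, int) expands to BoxSel::template Bind<int>::type,
// i.e. Box<int>.
static_assert(std::is_same<GTEST_BIND_(BoxSel, int), Box<int>>::value,
              "TemplateSel re-applies Box to int");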
template <GTEST_TEMPLATE_ Head_, GTEST_TEMPLATE_... Tail_>
struct Templates {
diff --git a/libvpx/third_party/googletest/src/src/gtest-all.cc b/libvpx/third_party/googletest/src/src/gtest-all.cc
index ad292905c..2a70ed88c 100644
--- a/libvpx/third_party/googletest/src/src/gtest-all.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-all.cc
@@ -38,7 +38,7 @@
#include "gtest/gtest.h"
// The following lines pull in the real gtest *.cc files.
-#include "src/gtest.cc"
+#include "src/gtest-assertion-result.cc"
#include "src/gtest-death-test.cc"
#include "src/gtest-filepath.cc"
#include "src/gtest-matchers.cc"
@@ -46,3 +46,4 @@
#include "src/gtest-printers.cc"
#include "src/gtest-test-part.cc"
#include "src/gtest-typed-test.cc"
+#include "src/gtest.cc"
diff --git a/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc b/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc
new file mode 100644
index 000000000..f1c0b10dc
--- /dev/null
+++ b/libvpx/third_party/googletest/src/src/gtest-assertion-result.cc
@@ -0,0 +1,77 @@
+// Copyright 2005, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// The Google C++ Testing and Mocking Framework (Google Test)
+//
+// This file defines the AssertionResult type.
+
+#include "gtest/gtest-assertion-result.h"
+
+#include <string>
+#include <utility>
+
+#include "gtest/gtest-message.h"
+
+namespace testing {
+
+// AssertionResult constructors.
+// Used in EXPECT_TRUE/FALSE(assertion_result).
+AssertionResult::AssertionResult(const AssertionResult& other)
+ : success_(other.success_),
+ message_(other.message_.get() != nullptr
+ ? new ::std::string(*other.message_)
+ : static_cast< ::std::string*>(nullptr)) {}
+
+// Swaps two AssertionResults.
+void AssertionResult::swap(AssertionResult& other) {
+ using std::swap;
+ swap(success_, other.success_);
+ swap(message_, other.message_);
+}
+
+// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
+AssertionResult AssertionResult::operator!() const {
+ AssertionResult negation(!success_);
+ if (message_.get() != nullptr) negation << *message_;
+ return negation;
+}
+
+// Makes a successful assertion result.
+AssertionResult AssertionSuccess() { return AssertionResult(true); }
+
+// Makes a failed assertion result.
+AssertionResult AssertionFailure() { return AssertionResult(false); }
+
+// Makes a failed assertion result with the given failure message.
+// Deprecated; use AssertionFailure() << message.
+AssertionResult AssertionFailure(const Message& message) {
+ return AssertionFailure() << message;
+}
+
+} // namespace testing
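The new translation unit only relocates AssertionResult's out-of-line definitions; the public API is unchanged. Its canonical use is a predicate whose streamed message feeds EXPECT_TRUE/EXPECT_FALSE:

#include "gtest/gtest.h"

testing::AssertionResult IsEven(int n) {
  if (n % 2 == 0) return testing::AssertionSuccess();
  return testing::AssertionFailure() << n << " is odd";
}

TEST(Parity, Demo) {
  EXPECT_TRUE(IsEven(4));   // Passes silently.
  EXPECT_FALSE(IsEven(3));  // operator! reuses the streamed message.
  // A failing EXPECT_TRUE(IsEven(3)) would print "3 is odd".
}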
diff --git a/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/libvpx/third_party/googletest/src/src/gtest-death-test.cc
index bf4f6331d..e6abc6278 100644
--- a/libvpx/third_party/googletest/src/src/gtest-death-test.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-death-test.cc
@@ -35,49 +35,49 @@
#include <functional>
#include <utility>
-#include "gtest/internal/gtest-port.h"
#include "gtest/internal/custom/gtest.h"
+#include "gtest/internal/gtest-port.h"
#if GTEST_HAS_DEATH_TEST
-# if GTEST_OS_MAC
-# include <crt_externs.h>
-# endif // GTEST_OS_MAC
-
-# include <errno.h>
-# include <fcntl.h>
-# include <limits.h>
-
-# if GTEST_OS_LINUX
-# include <signal.h>
-# endif // GTEST_OS_LINUX
-
-# include <stdarg.h>
-
-# if GTEST_OS_WINDOWS
-# include <windows.h>
-# else
-# include <sys/mman.h>
-# include <sys/wait.h>
-# endif // GTEST_OS_WINDOWS
-
-# if GTEST_OS_QNX
-# include <spawn.h>
-# endif // GTEST_OS_QNX
-
-# if GTEST_OS_FUCHSIA
-# include <lib/fdio/fd.h>
-# include <lib/fdio/io.h>
-# include <lib/fdio/spawn.h>
-# include <lib/zx/channel.h>
-# include <lib/zx/port.h>
-# include <lib/zx/process.h>
-# include <lib/zx/socket.h>
-# include <zircon/processargs.h>
-# include <zircon/syscalls.h>
-# include <zircon/syscalls/policy.h>
-# include <zircon/syscalls/port.h>
-# endif // GTEST_OS_FUCHSIA
+#if GTEST_OS_MAC
+#include <crt_externs.h>
+#endif // GTEST_OS_MAC
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+
+#if GTEST_OS_LINUX
+#include <signal.h>
+#endif // GTEST_OS_LINUX
+
+#include <stdarg.h>
+
+#if GTEST_OS_WINDOWS
+#include <windows.h>
+#else
+#include <sys/mman.h>
+#include <sys/wait.h>
+#endif // GTEST_OS_WINDOWS
+
+#if GTEST_OS_QNX
+#include <spawn.h>
+#endif // GTEST_OS_QNX
+
+#if GTEST_OS_FUCHSIA
+#include <lib/fdio/fd.h>
+#include <lib/fdio/io.h>
+#include <lib/fdio/spawn.h>
+#include <lib/zx/channel.h>
+#include <lib/zx/port.h>
+#include <lib/zx/process.h>
+#include <lib/zx/socket.h>
+#include <zircon/processargs.h>
+#include <zircon/syscalls.h>
+#include <zircon/syscalls/policy.h>
+#include <zircon/syscalls/port.h>
+#endif // GTEST_OS_FUCHSIA
#endif // GTEST_HAS_DEATH_TEST
@@ -96,9 +96,12 @@ namespace testing {
// used internally at Google, is "threadsafe".
static const char kDefaultDeathTestStyle[] = GTEST_DEFAULT_DEATH_TEST_STYLE;
+} // namespace testing
+
GTEST_DEFINE_string_(
death_test_style,
- internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle),
+ testing::internal::StringFromGTestEnv("death_test_style",
+ testing::kDefaultDeathTestStyle),
"Indicates how to run a death test in a forked child process: "
"\"threadsafe\" (child process re-executes the test binary "
"from the beginning, running only the specific death test) or "
@@ -107,7 +110,7 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_bool_(
death_test_use_fork,
- internal::BoolFromGTestEnv("death_test_use_fork", false),
+ testing::internal::BoolFromGTestEnv("death_test_use_fork", false),
"Instructs to use fork()/_exit() instead of clone() in death tests. "
"Ignored and always uses fork() on POSIX systems where clone() is not "
"implemented. Useful when running under valgrind or similar tools if "
@@ -117,7 +120,6 @@ GTEST_DEFINE_bool_(
"work in 99% of the cases. Once valgrind is fixed, this flag will "
"most likely be removed.");
-namespace internal {
GTEST_DEFINE_string_(
internal_run_death_test, "",
"Indicates the file, line number, temporal index of "
@@ -126,7 +128,8 @@ GTEST_DEFINE_string_(
"the '|' characters. This flag is specified if and only if the "
"current process is a sub-process launched for running a thread-safe "
"death test. FOR INTERNAL USE ONLY.");
-} // namespace internal
+
+namespace testing {
#if GTEST_HAS_DEATH_TEST
@@ -134,9 +137,9 @@ namespace internal {
// Valid only for fast death tests. Indicates the code is running in the
// child process of a fast style death test.
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
static bool g_in_fast_death_test_child = false;
-# endif
+#endif
// Returns a Boolean value indicating whether the caller is currently
// executing in the context of the death test child process. Tools such as
@@ -144,16 +147,16 @@ static bool g_in_fast_death_test_child = false;
// tests. IMPORTANT: This is an internal utility. Using it may break the
// implementation of death tests. User code MUST NOT use it.
bool InDeathTestChild() {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
// On Windows and Fuchsia, death tests are thread-safe regardless of the value
// of the death_test_style flag.
- return !GTEST_FLAG(internal_run_death_test).empty();
+ return !GTEST_FLAG_GET(internal_run_death_test).empty();
-# else
+#else
- if (GTEST_FLAG(death_test_style) == "threadsafe")
- return !GTEST_FLAG(internal_run_death_test).empty();
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe")
+ return !GTEST_FLAG_GET(internal_run_death_test).empty();
else
return g_in_fast_death_test_child;
#endif
@@ -162,40 +165,38 @@ bool InDeathTestChild() {
} // namespace internal
// ExitedWithCode constructor.
-ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {
-}
+ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) {}
// ExitedWithCode function-call operator.
bool ExitedWithCode::operator()(int exit_status) const {
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
return exit_status == exit_code_;
-# else
+#else
return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_;
-# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
}
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// KilledBySignal constructor.
-KilledBySignal::KilledBySignal(int signum) : signum_(signum) {
-}
+KilledBySignal::KilledBySignal(int signum) : signum_(signum) {}
// KilledBySignal function-call operator.
bool KilledBySignal::operator()(int exit_status) const {
-# if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#if defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
{
bool result;
if (GTEST_KILLED_BY_SIGNAL_OVERRIDE_(signum_, exit_status, &result)) {
return result;
}
}
-# endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
+#endif // defined(GTEST_KILLED_BY_SIGNAL_OVERRIDE_)
return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_;
}
-# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
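ExitedWithCode and KilledBySignal are the predicates users hand to EXPECT_EXIT; the bodies above compare the raw status on Windows/Fuchsia and decode it with WIFEXITED/WTERMSIG elsewhere. Typical use:

#include <csignal>
#include <cstdlib>

#include "gtest/gtest.h"

TEST(ShutdownDeathTest, Codes) {
  EXPECT_EXIT(std::exit(2), testing::ExitedWithCode(2), "");
#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
  EXPECT_EXIT(std::abort(), testing::KilledBySignal(SIGABRT), "");
#endif
}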
namespace internal {
@@ -206,23 +207,23 @@ namespace internal {
static std::string ExitSummary(int exit_code) {
Message m;
-# if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#if GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
m << "Exited with exit status " << exit_code;
-# else
+#else
if (WIFEXITED(exit_code)) {
m << "Exited with exit status " << WEXITSTATUS(exit_code);
} else if (WIFSIGNALED(exit_code)) {
m << "Terminated by signal " << WTERMSIG(exit_code);
}
-# ifdef WCOREDUMP
+#ifdef WCOREDUMP
if (WCOREDUMP(exit_code)) {
m << " (core dumped)";
}
-# endif
-# endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
+#endif
+#endif // GTEST_OS_WINDOWS || GTEST_OS_FUCHSIA
return m.GetString();
}
@@ -233,7 +234,7 @@ bool ExitedUnsuccessfully(int exit_status) {
return !ExitedWithCode(0)(exit_status);
}
-# if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#if !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Generates a textual failure message when a death test finds more than
// one thread running, or cannot determine the number of threads, prior
// to executing the given statement. It is the responsibility of the
@@ -254,7 +255,7 @@ static std::string DeathTestThreadWarning(size_t thread_count) {
<< " this is the last message you see before your test times out.";
return msg.GetString();
}
-# endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
+#endif // !GTEST_OS_WINDOWS && !GTEST_OS_FUCHSIA
// Flag characters for reporting a death test that did not die.
static const char kDeathTestLived = 'L';
@@ -304,14 +305,14 @@ static void DeathTestAbort(const std::string& message) {
// A replacement for CHECK that calls DeathTestAbort if the assertion
// fails.
-# define GTEST_DEATH_TEST_CHECK_(expression) \
- do { \
- if (!::testing::internal::IsTrue(expression)) { \
- DeathTestAbort( \
- ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
- + ::testing::internal::StreamableToString(__LINE__) + ": " \
- + #expression); \
- } \
+#define GTEST_DEATH_TEST_CHECK_(expression) \
+ do { \
+ if (!::testing::internal::IsTrue(expression)) { \
+ DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \
+ ", line " + \
+ ::testing::internal::StreamableToString(__LINE__) + \
+ ": " + #expression); \
+ } \
} while (::testing::internal::AlwaysFalse())
// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for
@@ -321,23 +322,23 @@ static void DeathTestAbort(const std::string& message) {
// evaluates the expression as long as it evaluates to -1 and sets
// errno to EINTR. If the expression evaluates to -1 but errno is
// something other than EINTR, DeathTestAbort is called.
-# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
- do { \
- int gtest_retval; \
- do { \
- gtest_retval = (expression); \
- } while (gtest_retval == -1 && errno == EINTR); \
- if (gtest_retval == -1) { \
- DeathTestAbort( \
- ::std::string("CHECK failed: File ") + __FILE__ + ", line " \
- + ::testing::internal::StreamableToString(__LINE__) + ": " \
- + #expression + " != -1"); \
- } \
+#define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \
+ do { \
+ int gtest_retval; \
+ do { \
+ gtest_retval = (expression); \
+ } while (gtest_retval == -1 && errno == EINTR); \
+ if (gtest_retval == -1) { \
+ DeathTestAbort(::std::string("CHECK failed: File ") + __FILE__ + \
+ ", line " + \
+ ::testing::internal::StreamableToString(__LINE__) + \
+ ": " + #expression + " != -1"); \
+ } \
} while (::testing::internal::AlwaysFalse())
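Hand-expanded, the syscall-checking macro is just an EINTR retry loop with an abort on genuine failure. Roughly, for an illustrative close(fd) call:

int gtest_retval;
do {
  gtest_retval = close(fd);  // the (expression) argument
} while (gtest_retval == -1 && errno == EINTR);  // retry interrupted calls
if (gtest_retval == -1) {
  // DeathTestAbort("CHECK failed: File ..., line ...: close(fd) != -1");
}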
// Returns the message describing the last system error in errno.
std::string GetLastErrnoDescription() {
- return errno == 0 ? "" : posix::StrError(errno);
+ return errno == 0 ? "" : posix::StrError(errno);
}
// This is called from a death test parent process to read a failure
@@ -370,8 +371,9 @@ static void FailFromInternalError(int fd) {
DeathTest::DeathTest() {
TestInfo* const info = GetUnitTestImpl()->current_test_info();
if (info == nullptr) {
- DeathTestAbort("Cannot run a death test outside of a TEST or "
- "TEST_F construct");
+ DeathTestAbort(
+ "Cannot run a death test outside of a TEST or "
+ "TEST_F construct");
}
}
@@ -500,9 +502,7 @@ void DeathTestImpl::ReadAndInterpretStatusByte() {
set_read_fd(-1);
}
-std::string DeathTestImpl::GetErrorLogs() {
- return GetCapturedStderr();
-}
+std::string DeathTestImpl::GetErrorLogs() { return GetCapturedStderr(); }
// Signals that the death test code which should have exited, didn't.
// Should be called only in a death test child process.
@@ -512,9 +512,9 @@ void DeathTestImpl::Abort(AbortReason reason) {
// The parent process considers the death test to be a failure if
// it finds any data in our pipe. So, here we write a single flag byte
// to the pipe, then exit.
- const char status_ch =
- reason == TEST_DID_NOT_DIE ? kDeathTestLived :
- reason == TEST_THREW_EXCEPTION ? kDeathTestThrew : kDeathTestReturned;
+ const char status_ch = reason == TEST_DID_NOT_DIE ? kDeathTestLived
+ : reason == TEST_THREW_EXCEPTION ? kDeathTestThrew
+ : kDeathTestReturned;
GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1));
// We are leaking the descriptor here because on some platforms (i.e.,
@@ -533,7 +533,7 @@ void DeathTestImpl::Abort(AbortReason reason) {
// much easier.
static ::std::string FormatDeathTestOutput(const ::std::string& output) {
::std::string ret;
- for (size_t at = 0; ; ) {
+ for (size_t at = 0;;) {
const size_t line_end = output.find('\n', at);
ret += "[ DEATH ] ";
if (line_end == ::std::string::npos) {
@@ -568,8 +568,7 @@ static ::std::string FormatDeathTestOutput(const ::std::string& output) {
// the first failing condition, in the order given above, is the one that is
// reported. Also sets the last death test message string.
bool DeathTestImpl::Passed(bool status_ok) {
- if (!spawned())
- return false;
+ if (!spawned()) return false;
const std::string error_message = GetErrorLogs();
@@ -580,15 +579,18 @@ bool DeathTestImpl::Passed(bool status_ok) {
switch (outcome()) {
case LIVED:
buffer << " Result: failed to die.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case THREW:
buffer << " Result: threw an exception.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case RETURNED:
buffer << " Result: illegal return in test statement.\n"
- << " Error msg:\n" << FormatDeathTestOutput(error_message);
+ << " Error msg:\n"
+ << FormatDeathTestOutput(error_message);
break;
case DIED:
if (status_ok) {
@@ -605,7 +607,8 @@ bool DeathTestImpl::Passed(bool status_ok) {
} else {
buffer << " Result: died but not with expected exit code:\n"
<< " " << ExitSummary(status()) << "\n"
- << "Actual msg:\n" << FormatDeathTestOutput(error_message);
+ << "Actual msg:\n"
+ << FormatDeathTestOutput(error_message);
}
break;
case IN_PROGRESS:
@@ -618,7 +621,7 @@ bool DeathTestImpl::Passed(bool status_ok) {
return success;
}
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
// WindowsDeathTest implements death tests on Windows. Due to the
// specifics of starting new processes on Windows, death tests there are
// always threadsafe, and Google Test considers the
@@ -679,14 +682,12 @@ class WindowsDeathTest : public DeathTestImpl {
// status, or 0 if no child process exists. As a side effect, sets the
// outcome data member.
int WindowsDeathTest::Wait() {
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
// Wait until the child either signals that it has acquired the write end
// of the pipe or it dies.
- const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() };
- switch (::WaitForMultipleObjects(2,
- wait_handles,
+ const HANDLE wait_handles[2] = {child_handle_.Get(), event_handle_.Get()};
+ switch (::WaitForMultipleObjects(2, wait_handles,
FALSE, // Waits for any of the handles.
INFINITE)) {
case WAIT_OBJECT_0:
@@ -707,9 +708,8 @@ int WindowsDeathTest::Wait() {
// returns immediately if the child has already exited, regardless of
// whether previous calls to WaitForMultipleObjects synchronized on this
// handle or not.
- GTEST_DEATH_TEST_CHECK_(
- WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(),
- INFINITE));
+ GTEST_DEATH_TEST_CHECK_(WAIT_OBJECT_0 ==
+ ::WaitForSingleObject(child_handle_.Get(), INFINITE));
DWORD status_code;
GTEST_DEATH_TEST_CHECK_(
::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE);
@@ -742,12 +742,12 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
SECURITY_ATTRIBUTES handles_are_inheritable = {sizeof(SECURITY_ATTRIBUTES),
nullptr, TRUE};
HANDLE read_handle, write_handle;
- GTEST_DEATH_TEST_CHECK_(
- ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable,
- 0) // Default buffer size.
- != FALSE);
- set_read_fd(::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle),
- O_RDONLY));
+ GTEST_DEATH_TEST_CHECK_(::CreatePipe(&read_handle, &write_handle,
+ &handles_are_inheritable,
+ 0) // Default buffer size.
+ != FALSE);
+ set_read_fd(
+ ::_open_osfhandle(reinterpret_cast<intptr_t>(read_handle), O_RDONLY));
write_handle_.Reset(write_handle);
event_handle_.Reset(::CreateEvent(
&handles_are_inheritable,
@@ -756,27 +756,26 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
nullptr)); // The event is unnamed.
GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != nullptr);
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag +
- "=" + file_ + "|" + StreamableToString(line_) + "|" +
- StreamableToString(death_test_index) + "|" +
+ std::string("--") + GTEST_FLAG_PREFIX_ +
+ "internal_run_death_test=" + file_ + "|" + StreamableToString(line_) +
+ "|" + StreamableToString(death_test_index) + "|" +
StreamableToString(static_cast<unsigned int>(::GetCurrentProcessId())) +
// size_t has the same width as pointers on both 32-bit and 64-bit
// Windows platforms.
// See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx.
- "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) +
- "|" + StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
+ "|" + StreamableToString(reinterpret_cast<size_t>(write_handle)) + "|" +
+ StreamableToString(reinterpret_cast<size_t>(event_handle_.Get()));
char executable_path[_MAX_PATH + 1]; // NOLINT
GTEST_DEATH_TEST_CHECK_(_MAX_PATH + 1 != ::GetModuleFileNameA(nullptr,
executable_path,
_MAX_PATH));
- std::string command_line =
- std::string(::GetCommandLineA()) + " " + filter_flag + " \"" +
- internal_flag + "\"";
+ std::string command_line = std::string(::GetCommandLineA()) + " " +
+ filter_flag + " \"" + internal_flag + "\"";
DeathTest::set_last_death_test_message("");
@@ -796,8 +795,8 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
GTEST_DEATH_TEST_CHECK_(
::CreateProcessA(
executable_path, const_cast<char*>(command_line.c_str()),
- nullptr, // Retuned process handle is not inheritable.
- nullptr, // Retuned thread handle is not inheritable.
+ nullptr, // Returned process handle is not inheritable.
+ nullptr, // Returned thread handle is not inheritable.
TRUE, // Child inherits all inheritable handles (for write_handle_).
0x0, // Default creation flags.
nullptr, // Inherit the parent's environment.
@@ -809,7 +808,7 @@ DeathTest::TestRole WindowsDeathTest::AssumeRole() {
return OVERSEE_TEST;
}
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
class FuchsiaDeathTest : public DeathTestImpl {
public:
@@ -855,18 +854,13 @@ class Arguments {
template <typename Str>
void AddArguments(const ::std::vector<Str>& arguments) {
for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
- i != arguments.end();
- ++i) {
+ i != arguments.end(); ++i) {
args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
}
}
- char* const* Argv() {
- return &args_[0];
- }
+ char* const* Argv() { return &args_[0]; }
- int size() {
- return static_cast<int>(args_.size()) - 1;
- }
+ int size() { return static_cast<int>(args_.size()) - 1; }
private:
std::vector<char*> args_;
@@ -880,8 +874,7 @@ int FuchsiaDeathTest::Wait() {
const int kSocketKey = 1;
const int kExceptionKey = 2;
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
// Create a port to wait for socket/task/exception events.
zx_status_t status_zx;
@@ -890,8 +883,8 @@ int FuchsiaDeathTest::Wait() {
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for the child process to terminate.
- status_zx = child_process_.wait_async(
- port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
+ status_zx =
+ child_process_.wait_async(port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for the socket to be readable or closed.
@@ -900,8 +893,8 @@ int FuchsiaDeathTest::Wait() {
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
// Register to wait for an exception.
- status_zx = exception_channel_.wait_async(
- port, kExceptionKey, ZX_CHANNEL_READABLE, 0);
+ status_zx = exception_channel_.wait_async(port, kExceptionKey,
+ ZX_CHANNEL_READABLE, 0);
GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
bool process_terminated = false;
@@ -931,9 +924,9 @@ int FuchsiaDeathTest::Wait() {
size_t old_length = captured_stderr_.length();
size_t bytes_read = 0;
captured_stderr_.resize(old_length + kBufferSize);
- status_zx = stderr_socket_.read(
- 0, &captured_stderr_.front() + old_length, kBufferSize,
- &bytes_read);
+ status_zx =
+ stderr_socket_.read(0, &captured_stderr_.front() + old_length,
+ kBufferSize, &bytes_read);
captured_stderr_.resize(old_length + bytes_read);
} while (status_zx == ZX_OK);
if (status_zx == ZX_ERR_PEER_CLOSED) {
@@ -987,13 +980,12 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Build the child process command line.
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
- const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
- + file_ + "|"
- + StreamableToString(line_) + "|"
- + StreamableToString(death_test_index);
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ kInternalRunDeathTestFlag + "=" + file_ +
+ "|" + StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index);
Arguments args;
args.AddArguments(GetInjectableArgvs());
args.AddArgument(filter_flag.c_str());
@@ -1016,8 +1008,7 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Create a socket pair that will be used to receive the child process' stderr.
zx::socket stderr_producer_socket;
- status =
- zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
+ status = zx::socket::create(0, &stderr_producer_socket, &stderr_socket_);
GTEST_DEATH_TEST_CHECK_(status >= 0);
int stderr_producer_fd = -1;
status =
@@ -1034,35 +1025,32 @@ DeathTest::TestRole FuchsiaDeathTest::AssumeRole() {
// Create a child job.
zx_handle_t child_job = ZX_HANDLE_INVALID;
- status = zx_job_create(zx_job_default(), 0, & child_job);
+ status = zx_job_create(zx_job_default(), 0, &child_job);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
zx_policy_basic_t policy;
policy.condition = ZX_POL_NEW_ANY;
policy.policy = ZX_POL_ACTION_ALLOW;
- status = zx_job_set_policy(
- child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC, &policy, 1);
+ status = zx_job_set_policy(child_job, ZX_JOB_POL_RELATIVE, ZX_JOB_POL_BASIC,
+ &policy, 1);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
// Create an exception channel attached to the |child_job|, to allow
// us to suppress the system default exception handler from firing.
- status =
- zx_task_create_exception_channel(
- child_job, 0, exception_channel_.reset_and_get_address());
+ status = zx_task_create_exception_channel(
+ child_job, 0, exception_channel_.reset_and_get_address());
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
// Spawn the child process.
- status = fdio_spawn_etc(
- child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0], args.Argv(), nullptr,
- 2, spawn_actions, child_process_.reset_and_get_address(), nullptr);
+ status = fdio_spawn_etc(child_job, FDIO_SPAWN_CLONE_ALL, args.Argv()[0],
+ args.Argv(), nullptr, 2, spawn_actions,
+ child_process_.reset_and_get_address(), nullptr);
GTEST_DEATH_TEST_CHECK_(status == ZX_OK);
set_spawned(true);
return OVERSEE_TEST;
}
-std::string FuchsiaDeathTest::GetErrorLogs() {
- return captured_stderr_;
-}
+std::string FuchsiaDeathTest::GetErrorLogs() { return captured_stderr_; }
#else // We are neither on Windows, nor on Fuchsia.
@@ -1093,8 +1081,7 @@ ForkingDeathTest::ForkingDeathTest(const char* a_statement,
// status, or 0 if no child process exists. As a side effect, sets the
// outcome data member.
int ForkingDeathTest::Wait() {
- if (!spawned())
- return 0;
+ if (!spawned()) return 0;
ReadAndInterpretStatusByte();
@@ -1173,11 +1160,11 @@ class ExecDeathTest : public ForkingDeathTest {
private:
static ::std::vector<std::string> GetArgvsForDeathTestChildProcess() {
::std::vector<std::string> args = GetInjectableArgvs();
-# if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#if defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
::std::vector<std::string> extra_args =
GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_();
args.insert(args.end(), extra_args.begin(), extra_args.end());
-# endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
+#endif // defined(GTEST_EXTRA_DEATH_TEST_COMMAND_LINE_ARGS_)
return args;
}
// The name of the file in which the death test is located.
@@ -1204,14 +1191,11 @@ class Arguments {
template <typename Str>
void AddArguments(const ::std::vector<Str>& arguments) {
for (typename ::std::vector<Str>::const_iterator i = arguments.begin();
- i != arguments.end();
- ++i) {
+ i != arguments.end(); ++i) {
args_.insert(args_.end() - 1, posix::StrDup(i->c_str()));
}
}
- char* const* Argv() {
- return &args_[0];
- }
+ char* const* Argv() { return &args_[0]; }
private:
std::vector<char*> args_;
@@ -1224,9 +1208,9 @@ struct ExecDeathTestArgs {
int close_fd; // File descriptor to close; the read end of a pipe
};
-# if GTEST_OS_QNX
+#if GTEST_OS_QNX
extern "C" char** environ;
-# else // GTEST_OS_QNX
+#else // GTEST_OS_QNX
// The main function for a threadsafe-style death test child process.
// This function is called in a clone()-ed process and thus must avoid
// any potentially unsafe operations like malloc or libc functions.
@@ -1241,8 +1225,8 @@ static int ExecDeathTestChildMain(void* child_arg) {
UnitTest::GetInstance()->original_working_dir();
// We can safely call chdir() as it's a direct system call.
if (chdir(original_dir) != 0) {
- DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
- GetLastErrnoDescription());
+ DeathTestAbort(std::string("chdir(\"") + original_dir +
+ "\") failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
@@ -1253,13 +1237,12 @@ static int ExecDeathTestChildMain(void* child_arg) {
// one path separator.
execv(args->argv[0], args->argv);
DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +
- original_dir + " failed: " +
- GetLastErrnoDescription());
+ original_dir + " failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
-# endif // GTEST_OS_QNX
+#endif // GTEST_OS_QNX
-# if GTEST_HAS_CLONE
+#if GTEST_HAS_CLONE
// Two utility routines that together determine the direction the stack
// grows.
// This could be accomplished more elegantly by a single recursive
@@ -1293,7 +1276,7 @@ static bool StackGrowsDown() {
StackLowerThanAddress(&dummy, &result);
return result;
}
-# endif // GTEST_HAS_CLONE
+#endif // GTEST_HAS_CLONE
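
As a standalone companion to the stack-direction probe above: a local in the callee's frame is compared against an address from the caller's frame. Comparing pointers to distinct objects is formally unspecified in C++, which is why the probe lives in its own function; __attribute__((noinline)) is a GCC/Clang extension added here to keep the two frames distinct. A sketch, not the gtest implementation:

#include <cstdint>
#include <cstdio>

// Probe in a separate, non-inlined function so its local sits in a deeper
// stack frame than the caller's.
__attribute__((noinline)) static void StackLowerThanAddress(const void* ptr,
                                                            bool* result) {
  int dummy = 0;
  *result = reinterpret_cast<std::uintptr_t>(&dummy) <
            reinterpret_cast<std::uintptr_t>(ptr);
}

static bool StackGrowsDown() {
  int dummy = 0;
  bool result = false;
  StackLowerThanAddress(&dummy, &result);
  return result;
}

int main() {
  std::printf("stack grows %s\n", StackGrowsDown() ? "down" : "up");
  return 0;
}
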
// Spawns a child process with the same executable as the current process in
// a thread-safe manner and instructs it to run the death test. The
@@ -1303,10 +1286,10 @@ static bool StackGrowsDown() {
// spawn(2) there instead. The function dies with an error message if
// anything goes wrong.
static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
- ExecDeathTestArgs args = { argv, close_fd };
+ ExecDeathTestArgs args = {argv, close_fd};
pid_t child_pid = -1;
-# if GTEST_OS_QNX
+#if GTEST_OS_QNX
// Obtains the current directory and sets it to be closed in the child
// process.
const int cwd_fd = open(".", O_RDONLY);
@@ -1319,16 +1302,16 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
UnitTest::GetInstance()->original_working_dir();
// We can safely call chdir() as it's a direct system call.
if (chdir(original_dir) != 0) {
- DeathTestAbort(std::string("chdir(\"") + original_dir + "\") failed: " +
- GetLastErrnoDescription());
+ DeathTestAbort(std::string("chdir(\"") + original_dir +
+ "\") failed: " + GetLastErrnoDescription());
return EXIT_FAILURE;
}
int fd_flags;
// Set close_fd to be closed after spawn.
GTEST_DEATH_TEST_CHECK_SYSCALL_(fd_flags = fcntl(close_fd, F_GETFD));
- GTEST_DEATH_TEST_CHECK_SYSCALL_(fcntl(close_fd, F_SETFD,
- fd_flags | FD_CLOEXEC));
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ fcntl(close_fd, F_SETFD, fd_flags | FD_CLOEXEC));
struct inheritance inherit = {0};
// spawn is a system call.
child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
@@ -1336,8 +1319,8 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
-# else // GTEST_OS_QNX
-# if GTEST_OS_LINUX
+#else // GTEST_OS_QNX
+#if GTEST_OS_LINUX
// When a SIGPROF signal is received while fork() or clone() are executing,
// the process may hang. To avoid this, we ignore SIGPROF here and re-enable
// it after the call to fork()/clone() is complete.
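
The comment above describes the workaround the next hunk merely reflows. As a self-contained POSIX sketch of the same pattern (not the gtest code itself):

#include <signal.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

// Ignore SIGPROF for the duration of fork(), then restore the previous
// disposition; a profiling signal delivered mid-fork can otherwise hang
// the process.
pid_t ForkWithSigprofIgnored() {
  struct sigaction ignore_action, saved_action;
  memset(&ignore_action, 0, sizeof(ignore_action));
  sigemptyset(&ignore_action.sa_mask);
  ignore_action.sa_handler = SIG_IGN;
  sigaction(SIGPROF, &ignore_action, &saved_action);
  const pid_t child_pid = fork();
  sigaction(SIGPROF, &saved_action, nullptr);  // runs in parent and child
  return child_pid;
}
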
@@ -1346,12 +1329,12 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
memset(&ignore_sigprof_action, 0, sizeof(ignore_sigprof_action));
sigemptyset(&ignore_sigprof_action.sa_mask);
ignore_sigprof_action.sa_handler = SIG_IGN;
- GTEST_DEATH_TEST_CHECK_SYSCALL_(sigaction(
- SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
-# endif // GTEST_OS_LINUX
+ GTEST_DEATH_TEST_CHECK_SYSCALL_(
+ sigaction(SIGPROF, &ignore_sigprof_action, &saved_sigprof_action));
+#endif // GTEST_OS_LINUX
-# if GTEST_HAS_CLONE
- const bool use_fork = GTEST_FLAG(death_test_use_fork);
+#if GTEST_HAS_CLONE
+ const bool use_fork = GTEST_FLAG_GET(death_test_use_fork);
if (!use_fork) {
static const bool stack_grows_down = StackGrowsDown();
@@ -1370,7 +1353,7 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
const size_t kMaxStackAlignment = 64;
void* const stack_top =
static_cast<char*>(stack) +
- (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
+ (stack_grows_down ? stack_size - kMaxStackAlignment : 0);
GTEST_DEATH_TEST_CHECK_(
static_cast<size_t>(stack_size) > kMaxStackAlignment &&
reinterpret_cast<uintptr_t>(stack_top) % kMaxStackAlignment == 0);
@@ -1379,19 +1362,19 @@ static pid_t ExecDeathTestSpawnChild(char* const* argv, int close_fd) {
GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1);
}
-# else
+#else
const bool use_fork = true;
-# endif // GTEST_HAS_CLONE
+#endif // GTEST_HAS_CLONE
if (use_fork && (child_pid = fork()) == 0) {
- ExecDeathTestChildMain(&args);
- _exit(0);
+ ExecDeathTestChildMain(&args);
+ _exit(0);
}
-# endif // GTEST_OS_QNX
-# if GTEST_OS_LINUX
+#endif // GTEST_OS_QNX
+#if GTEST_OS_LINUX
GTEST_DEATH_TEST_CHECK_SYSCALL_(
sigaction(SIGPROF, &saved_sigprof_action, nullptr));
-# endif // GTEST_OS_LINUX
+#endif // GTEST_OS_LINUX
GTEST_DEATH_TEST_CHECK_(child_pid != -1);
return child_pid;
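
A rough Linux-only companion to the clone() path above, with the details this function handles carefully (stack alignment, fork fallback, error checks) deliberately trimmed; treat it as a sketch of the mmap-a-stack-then-clone idea only:

#define _GNU_SOURCE
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/wait.h>

static int ChildMain(void* /*arg*/) {
  // Only async-signal-safe work belongs here, per the comments above.
  return 42;
}

int main() {
  const size_t stack_size = 64 * 1024;
  void* stack = mmap(nullptr, stack_size, PROT_READ | PROT_WRITE,
                     MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
  if (stack == MAP_FAILED) return 1;
  // clone() takes the *top* of the region when the stack grows down.
  void* stack_top = static_cast<char*>(stack) + stack_size;
  const pid_t child = clone(ChildMain, stack_top, SIGCHLD, nullptr);
  if (child == -1) return 1;
  int status = 0;
  waitpid(child, &status, 0);
  munmap(stack, stack_size);
  printf("child exited with %d\n", WEXITSTATUS(status));
  return 0;
}
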
@@ -1420,13 +1403,13 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1);
const std::string filter_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
- kFilterFlag + "=" + info->test_suite_name() +
- "." + info->name();
- const std::string internal_flag =
- std::string("--") + GTEST_FLAG_PREFIX_ + kInternalRunDeathTestFlag + "="
- + file_ + "|" + StreamableToString(line_) + "|"
- + StreamableToString(death_test_index) + "|"
- + StreamableToString(pipe_fd[1]);
+ "filter=" + info->test_suite_name() + "." +
+ info->name();
+ const std::string internal_flag = std::string("--") + GTEST_FLAG_PREFIX_ +
+ "internal_run_death_test=" + file_ + "|" +
+ StreamableToString(line_) + "|" +
+ StreamableToString(death_test_index) + "|" +
+ StreamableToString(pipe_fd[1]);
Arguments args;
args.AddArguments(GetArgvsForDeathTestChildProcess());
args.AddArgument(filter_flag.c_str());
@@ -1447,7 +1430,7 @@ DeathTest::TestRole ExecDeathTest::AssumeRole() {
return OVERSEE_TEST;
}
-# endif // !GTEST_OS_WINDOWS
+#endif // !GTEST_OS_WINDOWS
// Creates a concrete DeathTest-derived class that depends on the
// --gtest_death_test_style flag, and sets the pointer pointed to
@@ -1461,15 +1444,15 @@ bool DefaultDeathTestFactory::Create(const char* statement,
UnitTestImpl* const impl = GetUnitTestImpl();
const InternalRunDeathTestFlag* const flag =
impl->internal_run_death_test_flag();
- const int death_test_index = impl->current_test_info()
- ->increment_death_test_count();
+ const int death_test_index =
+ impl->current_test_info()->increment_death_test_count();
if (flag != nullptr) {
if (death_test_index > flag->index()) {
DeathTest::set_last_death_test_message(
- "Death test count (" + StreamableToString(death_test_index)
- + ") somehow exceeded expected maximum ("
- + StreamableToString(flag->index()) + ")");
+ "Death test count (" + StreamableToString(death_test_index) +
+ ") somehow exceeded expected maximum (" +
+ StreamableToString(flag->index()) + ")");
return false;
}
@@ -1480,50 +1463,50 @@ bool DefaultDeathTestFactory::Create(const char* statement,
}
}
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
- if (GTEST_FLAG(death_test_style) == "threadsafe" ||
- GTEST_FLAG(death_test_style) == "fast") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+ GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new WindowsDeathTest(statement, std::move(matcher), file, line);
}
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
- if (GTEST_FLAG(death_test_style) == "threadsafe" ||
- GTEST_FLAG(death_test_style) == "fast") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe" ||
+ GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new FuchsiaDeathTest(statement, std::move(matcher), file, line);
}
-# else
+#else
- if (GTEST_FLAG(death_test_style) == "threadsafe") {
+ if (GTEST_FLAG_GET(death_test_style) == "threadsafe") {
*test = new ExecDeathTest(statement, std::move(matcher), file, line);
- } else if (GTEST_FLAG(death_test_style) == "fast") {
+ } else if (GTEST_FLAG_GET(death_test_style) == "fast") {
*test = new NoExecDeathTest(statement, std::move(matcher));
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
else { // NOLINT - this is more readable than unbalanced brackets inside #if.
- DeathTest::set_last_death_test_message(
- "Unknown death test style \"" + GTEST_FLAG(death_test_style)
- + "\" encountered");
+ DeathTest::set_last_death_test_message("Unknown death test style \"" +
+ GTEST_FLAG_GET(death_test_style) +
+ "\" encountered");
return false;
}
return true;
}
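
Much of the churn in this hunk is the flag-accessor migration that runs through the whole file: direct GTEST_FLAG(name) reads become GTEST_FLAG_GET(name), and assignments become GTEST_FLAG_SET(name, value). From user code the new accessors look like this (a sketch; both macros are public in this googletest version):

#include "gtest/gtest.h"

void ConfigureDeathTests() {
  // Old: GTEST_FLAG(death_test_style) = "threadsafe";
  GTEST_FLAG_SET(death_test_style, "threadsafe");
  // Old: if (GTEST_FLAG(death_test_style) == "fast") ...
  if (GTEST_FLAG_GET(death_test_style) == "threadsafe") {
    // each death test will re-exec the test binary
  }
}
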
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
// Recreates the pipe and event handles from the provided parameters,
// signals the event, and returns a file descriptor wrapped around the pipe
// handle. This function is called in the child process only.
static int GetStatusFileDescriptor(unsigned int parent_process_id,
- size_t write_handle_as_size_t,
- size_t event_handle_as_size_t) {
+ size_t write_handle_as_size_t,
+ size_t event_handle_as_size_t) {
AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE,
- FALSE, // Non-inheritable.
- parent_process_id));
+ FALSE, // Non-inheritable.
+ parent_process_id));
if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) {
DeathTestAbort("Unable to open parent process " +
StreamableToString(parent_process_id));
@@ -1531,8 +1514,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t));
- const HANDLE write_handle =
- reinterpret_cast<HANDLE>(write_handle_as_size_t);
+ const HANDLE write_handle = reinterpret_cast<HANDLE>(write_handle_as_size_t);
HANDLE dup_write_handle;
// The newly initialized handle is accessible only in the parent
@@ -1554,9 +1536,7 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
HANDLE dup_event_handle;
if (!::DuplicateHandle(parent_process_handle.Get(), event_handle,
- ::GetCurrentProcess(), &dup_event_handle,
- 0x0,
- FALSE,
+ ::GetCurrentProcess(), &dup_event_handle, 0x0, FALSE,
DUPLICATE_SAME_ACCESS)) {
DeathTestAbort("Unable to duplicate the event handle " +
StreamableToString(event_handle_as_size_t) +
@@ -1578,61 +1558,57 @@ static int GetStatusFileDescriptor(unsigned int parent_process_id,
return write_fd;
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
// Returns a newly created InternalRunDeathTestFlag object with fields
// initialized from the GTEST_FLAG(internal_run_death_test) flag if
// the flag is specified; otherwise returns NULL.
InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() {
- if (GTEST_FLAG(internal_run_death_test) == "") return nullptr;
+ if (GTEST_FLAG_GET(internal_run_death_test) == "") return nullptr;
// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we
// can use it here.
int line = -1;
int index = -1;
::std::vector< ::std::string> fields;
- SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields);
+ SplitString(GTEST_FLAG_GET(internal_run_death_test), '|', &fields);
int write_fd = -1;
-# if GTEST_OS_WINDOWS
+#if GTEST_OS_WINDOWS
unsigned int parent_process_id = 0;
size_t write_handle_as_size_t = 0;
size_t event_handle_as_size_t = 0;
- if (fields.size() != 6
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)
- || !ParseNaturalNumber(fields[3], &parent_process_id)
- || !ParseNaturalNumber(fields[4], &write_handle_as_size_t)
- || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
+ if (fields.size() != 6 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index) ||
+ !ParseNaturalNumber(fields[3], &parent_process_id) ||
+ !ParseNaturalNumber(fields[4], &write_handle_as_size_t) ||
+ !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) {
DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
- GTEST_FLAG(internal_run_death_test));
+ GTEST_FLAG_GET(internal_run_death_test));
}
- write_fd = GetStatusFileDescriptor(parent_process_id,
- write_handle_as_size_t,
+ write_fd = GetStatusFileDescriptor(parent_process_id, write_handle_as_size_t,
event_handle_as_size_t);
-# elif GTEST_OS_FUCHSIA
+#elif GTEST_OS_FUCHSIA
- if (fields.size() != 3
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)) {
- DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
- + GTEST_FLAG(internal_run_death_test));
+ if (fields.size() != 3 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
}
-# else
+#else
- if (fields.size() != 4
- || !ParseNaturalNumber(fields[1], &line)
- || !ParseNaturalNumber(fields[2], &index)
- || !ParseNaturalNumber(fields[3], &write_fd)) {
- DeathTestAbort("Bad --gtest_internal_run_death_test flag: "
- + GTEST_FLAG(internal_run_death_test));
+ if (fields.size() != 4 || !ParseNaturalNumber(fields[1], &line) ||
+ !ParseNaturalNumber(fields[2], &index) ||
+ !ParseNaturalNumber(fields[3], &write_fd)) {
+ DeathTestAbort("Bad --gtest_internal_run_death_test flag: " +
+ GTEST_FLAG_GET(internal_run_death_test));
}
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
return new InternalRunDeathTestFlag(fields[0], line, index, write_fd);
}
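
To make the parse above concrete: the flag payload is '|'-separated, e.g. "my_test.cc|42|1|7" on POSIX (file, line, death-test index, write fd); Windows carries six fields and Fuchsia three, as the size checks show. A stand-in for the SplitString() helper it relies on (illustrative, not the gtest implementation):

#include <sstream>
#include <string>
#include <vector>

// Split "my_test.cc|42|1|7" into {"my_test.cc", "42", "1", "7"}.
std::vector<std::string> SplitFields(const std::string& payload) {
  std::vector<std::string> fields;
  std::istringstream stream(payload);
  std::string field;
  while (std::getline(stream, field, '|')) fields.push_back(field);
  return fields;
}
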
diff --git a/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/libvpx/third_party/googletest/src/src/gtest-filepath.cc
index 0b5629401..f6ee90cdb 100644
--- a/libvpx/third_party/googletest/src/src/gtest-filepath.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-filepath.cc
@@ -30,29 +30,31 @@
#include "gtest/internal/gtest-filepath.h"
#include <stdlib.h>
-#include "gtest/internal/gtest-port.h"
+
#include "gtest/gtest-message.h"
+#include "gtest/internal/gtest-port.h"
#if GTEST_OS_WINDOWS_MOBILE
-# include <windows.h>
+#include <windows.h>
#elif GTEST_OS_WINDOWS
-# include <direct.h>
-# include <io.h>
+#include <direct.h>
+#include <io.h>
#else
-# include <limits.h>
-# include <climits> // Some Linux distributions define PATH_MAX here.
-#endif // GTEST_OS_WINDOWS_MOBILE
+#include <limits.h>
+
+#include <climits> // Some Linux distributions define PATH_MAX here.
+#endif // GTEST_OS_WINDOWS_MOBILE
#include "gtest/internal/gtest-string.h"
#if GTEST_OS_WINDOWS
-# define GTEST_PATH_MAX_ _MAX_PATH
+#define GTEST_PATH_MAX_ _MAX_PATH
#elif defined(PATH_MAX)
-# define GTEST_PATH_MAX_ PATH_MAX
+#define GTEST_PATH_MAX_ PATH_MAX
#elif defined(_XOPEN_PATH_MAX)
-# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
+#define GTEST_PATH_MAX_ _XOPEN_PATH_MAX
#else
-# define GTEST_PATH_MAX_ _POSIX_PATH_MAX
+#define GTEST_PATH_MAX_ _POSIX_PATH_MAX
#endif // GTEST_OS_WINDOWS
namespace testing {
@@ -66,16 +68,16 @@ namespace internal {
const char kPathSeparator = '\\';
const char kAlternatePathSeparator = '/';
const char kAlternatePathSeparatorString[] = "/";
-# if GTEST_OS_WINDOWS_MOBILE
+#if GTEST_OS_WINDOWS_MOBILE
// Windows CE doesn't have a current directory. You should not use
// the current directory in tests on Windows CE, but this at least
// provides a reasonable fallback.
const char kCurrentDirectoryString[] = "\\";
// Windows CE doesn't define INVALID_FILE_ATTRIBUTES
const DWORD kInvalidFileAttributes = 0xffffffff;
-# else
+#else
const char kCurrentDirectoryString[] = ".\\";
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
#else
const char kPathSeparator = '/';
const char kCurrentDirectoryString[] = "./";
@@ -99,17 +101,17 @@ FilePath FilePath::GetCurrentDir() {
// something reasonable.
return FilePath(kCurrentDirectoryString);
#elif GTEST_OS_WINDOWS
- char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+ char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
return FilePath(_getcwd(cwd, sizeof(cwd)) == nullptr ? "" : cwd);
#else
- char cwd[GTEST_PATH_MAX_ + 1] = { '\0' };
+ char cwd[GTEST_PATH_MAX_ + 1] = {'\0'};
char* result = getcwd(cwd, sizeof(cwd));
-# if GTEST_OS_NACL
+#if GTEST_OS_NACL
// getcwd will likely fail in NaCl due to the sandbox, so return something
// reasonable. The user may have provided a shim implementation for getcwd,
// however, so fallback only when failure is detected.
return FilePath(result == nullptr ? kCurrentDirectoryString : cwd);
-# endif // GTEST_OS_NACL
+#endif // GTEST_OS_NACL
return FilePath(result == nullptr ? "" : cwd);
#endif // GTEST_OS_WINDOWS_MOBILE
}
@@ -121,8 +123,8 @@ FilePath FilePath::GetCurrentDir() {
FilePath FilePath::RemoveExtension(const char* extension) const {
const std::string dot_extension = std::string(".") + extension;
if (String::EndsWithCaseInsensitive(pathname_, dot_extension)) {
- return FilePath(pathname_.substr(
- 0, pathname_.length() - dot_extension.length()));
+ return FilePath(
+ pathname_.substr(0, pathname_.length() - dot_extension.length()));
}
return *this;
}
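
The reflowed call above strips a trailing ".extension" case-insensitively. A standalone sketch of the same substr arithmetic (case-sensitive for brevity, unlike the EndsWithCaseInsensitive check the real code uses):

#include <cassert>
#include <string>

std::string RemoveExtension(const std::string& path, const std::string& ext) {
  const std::string dot_ext = "." + ext;
  if (path.size() >= dot_ext.size() &&
      path.compare(path.size() - dot_ext.size(), dot_ext.size(), dot_ext) == 0)
    return path.substr(0, path.size() - dot_ext.size());
  return path;  // no matching suffix: unchanged
}

int main() {
  assert(RemoveExtension("foo_test.xml", "xml") == "foo_test");
  assert(RemoveExtension("foo_test.cc", "xml") == "foo_test.cc");
  return 0;
}
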
@@ -178,15 +180,14 @@ FilePath FilePath::RemoveFileName() const {
// than zero (e.g., 12), returns "dir/test_12.xml".
// On Windows platform, uses \ as the separator rather than /.
FilePath FilePath::MakeFileName(const FilePath& directory,
- const FilePath& base_name,
- int number,
+ const FilePath& base_name, int number,
const char* extension) {
std::string file;
if (number == 0) {
file = base_name.string() + "." + extension;
} else {
- file = base_name.string() + "_" + StreamableToString(number)
- + "." + extension;
+ file =
+ base_name.string() + "_" + StreamableToString(number) + "." + extension;
}
return ConcatPaths(directory, FilePath(file));
}
@@ -195,8 +196,7 @@ FilePath FilePath::MakeFileName(const FilePath& directory,
// On Windows, uses \ as the separator rather than /.
FilePath FilePath::ConcatPaths(const FilePath& directory,
const FilePath& relative_path) {
- if (directory.IsEmpty())
- return relative_path;
+ if (directory.IsEmpty()) return relative_path;
const FilePath dir(directory.RemoveTrailingPathSeparator());
return FilePath(dir.string() + kPathSeparator + relative_path.string());
}
@@ -207,7 +207,7 @@ bool FilePath::FileOrDirectoryExists() const {
#if GTEST_OS_WINDOWS_MOBILE
LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str());
const DWORD attributes = GetFileAttributes(unicode);
- delete [] unicode;
+ delete[] unicode;
return attributes != kInvalidFileAttributes;
#else
posix::StatStruct file_stat{};
@@ -222,8 +222,8 @@ bool FilePath::DirectoryExists() const {
#if GTEST_OS_WINDOWS
// Don't strip off trailing separator if path is a root directory on
// Windows (like "C:\\").
- const FilePath& path(IsRootDirectory() ? *this :
- RemoveTrailingPathSeparator());
+ const FilePath& path(IsRootDirectory() ? *this
+ : RemoveTrailingPathSeparator());
#else
const FilePath& path(*this);
#endif
@@ -231,15 +231,15 @@ bool FilePath::DirectoryExists() const {
#if GTEST_OS_WINDOWS_MOBILE
LPCWSTR unicode = String::AnsiToUtf16(path.c_str());
const DWORD attributes = GetFileAttributes(unicode);
- delete [] unicode;
+ delete[] unicode;
if ((attributes != kInvalidFileAttributes) &&
(attributes & FILE_ATTRIBUTE_DIRECTORY)) {
result = true;
}
#else
posix::StatStruct file_stat{};
- result = posix::Stat(path.c_str(), &file_stat) == 0 &&
- posix::IsDir(file_stat);
+ result =
+ posix::Stat(path.c_str(), &file_stat) == 0 && posix::IsDir(file_stat);
#endif // GTEST_OS_WINDOWS_MOBILE
return result;
@@ -260,10 +260,9 @@ bool FilePath::IsAbsolutePath() const {
const char* const name = pathname_.c_str();
#if GTEST_OS_WINDOWS
return pathname_.length() >= 3 &&
- ((name[0] >= 'a' && name[0] <= 'z') ||
- (name[0] >= 'A' && name[0] <= 'Z')) &&
- name[1] == ':' &&
- IsPathSeparator(name[2]);
+ ((name[0] >= 'a' && name[0] <= 'z') ||
+ (name[0] >= 'A' && name[0] <= 'Z')) &&
+ name[1] == ':' && IsPathSeparator(name[2]);
#else
return IsPathSeparator(name[0]);
#endif
@@ -321,7 +320,7 @@ bool FilePath::CreateFolder() const {
FilePath removed_sep(this->RemoveTrailingPathSeparator());
LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str());
int result = CreateDirectory(unicode, nullptr) ? 0 : -1;
- delete [] unicode;
+ delete[] unicode;
#elif GTEST_OS_WINDOWS
int result = _mkdir(pathname_.c_str());
#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
@@ -341,9 +340,8 @@ bool FilePath::CreateFolder() const {
// name, otherwise return the name string unmodified.
// On Windows platform, uses \ as the separator, other platforms use /.
FilePath FilePath::RemoveTrailingPathSeparator() const {
- return IsDirectory()
- ? FilePath(pathname_.substr(0, pathname_.length() - 1))
- : *this;
+ return IsDirectory() ? FilePath(pathname_.substr(0, pathname_.length() - 1))
+ : *this;
}
// Removes any redundant separators that might be in the pathname.
diff --git a/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
index 6d8cecbbb..0b9e929c6 100644
--- a/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
+++ b/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
@@ -35,7 +35,7 @@
#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
#ifndef _WIN32_WCE
-# include <errno.h>
+#include <errno.h>
#endif // !_WIN32_WCE
#include <stddef.h>
#include <stdlib.h> // For strtoll/_strtoul64/malloc/free.
@@ -50,22 +50,20 @@
#include "gtest/internal/gtest-port.h"
#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h> // NOLINT
-# include <netdb.h> // NOLINT
+#include <arpa/inet.h> // NOLINT
+#include <netdb.h> // NOLINT
#endif
#if GTEST_OS_WINDOWS
-# include <windows.h> // NOLINT
-#endif // GTEST_OS_WINDOWS
+#include <windows.h> // NOLINT
+#endif // GTEST_OS_WINDOWS
-#include "gtest/gtest.h"
#include "gtest/gtest-spi.h"
+#include "gtest/gtest.h"
GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
/* class A needs to have dll-interface to be used by clients of class B */)
-namespace testing {
-
// Declares the flags.
//
// We don't want the users to modify this flag in the code, but want
@@ -73,32 +71,13 @@ namespace testing {
// declare it here as opposed to in gtest.h.
GTEST_DECLARE_bool_(death_test_use_fork);
+namespace testing {
namespace internal {
// The value of GetTestTypeId() as seen from within the Google Test
// library. This is solely for testing GetTestTypeId().
GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest;
-// Names of the flags (needed for parsing Google Test flags).
-const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests";
-const char kBreakOnFailureFlag[] = "break_on_failure";
-const char kCatchExceptionsFlag[] = "catch_exceptions";
-const char kColorFlag[] = "color";
-const char kFailFast[] = "fail_fast";
-const char kFilterFlag[] = "filter";
-const char kListTestsFlag[] = "list_tests";
-const char kOutputFlag[] = "output";
-const char kBriefFlag[] = "brief";
-const char kPrintTimeFlag[] = "print_time";
-const char kPrintUTF8Flag[] = "print_utf8";
-const char kRandomSeedFlag[] = "random_seed";
-const char kRepeatFlag[] = "repeat";
-const char kShuffleFlag[] = "shuffle";
-const char kStackTraceDepthFlag[] = "stack_trace_depth";
-const char kStreamResultToFlag[] = "stream_result_to";
-const char kThrowOnFailureFlag[] = "throw_on_failure";
-const char kFlagfileFlag[] = "flagfile";
-
// A valid random seed must be in [1, kMaxRandomSeed].
const int kMaxRandomSeed = 99999;
@@ -125,21 +104,21 @@ GTEST_API_ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms);
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-GTEST_API_ bool ParseInt32Flag(
- const char* str, const char* flag, int32_t* value);
+GTEST_API_ bool ParseFlag(const char* str, const char* flag, int32_t* value);
// Returns a random seed in range [1, kMaxRandomSeed] based on the
// given --gtest_random_seed flag value.
inline int GetRandomSeedFromFlag(int32_t random_seed_flag) {
- const unsigned int raw_seed = (random_seed_flag == 0) ?
- static_cast<unsigned int>(GetTimeInMillis()) :
- static_cast<unsigned int>(random_seed_flag);
+ const unsigned int raw_seed =
+ (random_seed_flag == 0) ? static_cast<unsigned int>(GetTimeInMillis())
+ : static_cast<unsigned int>(random_seed_flag);
// Normalizes the actual seed to range [1, kMaxRandomSeed] such that
// it's easy to type.
const int normalized_seed =
static_cast<int>((raw_seed - 1U) %
- static_cast<unsigned int>(kMaxRandomSeed)) + 1;
+ static_cast<unsigned int>(kMaxRandomSeed)) +
+ 1;
return normalized_seed;
}
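
The reindented expression above maps any 32-bit seed into [1, kMaxRandomSeed]. A quick standalone check of the arithmetic, with kMaxRandomSeed = 99999 as declared a few hunks up:

#include <cassert>

int Normalize(unsigned int raw_seed) {
  const unsigned int kMaxRandomSeed = 99999;
  return static_cast<int>((raw_seed - 1U) % kMaxRandomSeed) + 1;
}

int main() {
  assert(Normalize(1) == 1);          // bottom of the range maps to itself
  assert(Normalize(99999) == 99999);  // top of the range maps to itself
  assert(Normalize(100000) == 1);     // then wraps back around
  return 0;
}
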
@@ -160,50 +139,54 @@ class GTestFlagSaver {
public:
// The c'tor.
GTestFlagSaver() {
- also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests);
- break_on_failure_ = GTEST_FLAG(break_on_failure);
- catch_exceptions_ = GTEST_FLAG(catch_exceptions);
- color_ = GTEST_FLAG(color);
- death_test_style_ = GTEST_FLAG(death_test_style);
- death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
- fail_fast_ = GTEST_FLAG(fail_fast);
- filter_ = GTEST_FLAG(filter);
- internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
- list_tests_ = GTEST_FLAG(list_tests);
- output_ = GTEST_FLAG(output);
- brief_ = GTEST_FLAG(brief);
- print_time_ = GTEST_FLAG(print_time);
- print_utf8_ = GTEST_FLAG(print_utf8);
- random_seed_ = GTEST_FLAG(random_seed);
- repeat_ = GTEST_FLAG(repeat);
- shuffle_ = GTEST_FLAG(shuffle);
- stack_trace_depth_ = GTEST_FLAG(stack_trace_depth);
- stream_result_to_ = GTEST_FLAG(stream_result_to);
- throw_on_failure_ = GTEST_FLAG(throw_on_failure);
+ also_run_disabled_tests_ = GTEST_FLAG_GET(also_run_disabled_tests);
+ break_on_failure_ = GTEST_FLAG_GET(break_on_failure);
+ catch_exceptions_ = GTEST_FLAG_GET(catch_exceptions);
+ color_ = GTEST_FLAG_GET(color);
+ death_test_style_ = GTEST_FLAG_GET(death_test_style);
+ death_test_use_fork_ = GTEST_FLAG_GET(death_test_use_fork);
+ fail_fast_ = GTEST_FLAG_GET(fail_fast);
+ filter_ = GTEST_FLAG_GET(filter);
+ internal_run_death_test_ = GTEST_FLAG_GET(internal_run_death_test);
+ list_tests_ = GTEST_FLAG_GET(list_tests);
+ output_ = GTEST_FLAG_GET(output);
+ brief_ = GTEST_FLAG_GET(brief);
+ print_time_ = GTEST_FLAG_GET(print_time);
+ print_utf8_ = GTEST_FLAG_GET(print_utf8);
+ random_seed_ = GTEST_FLAG_GET(random_seed);
+ repeat_ = GTEST_FLAG_GET(repeat);
+ recreate_environments_when_repeating_ =
+ GTEST_FLAG_GET(recreate_environments_when_repeating);
+ shuffle_ = GTEST_FLAG_GET(shuffle);
+ stack_trace_depth_ = GTEST_FLAG_GET(stack_trace_depth);
+ stream_result_to_ = GTEST_FLAG_GET(stream_result_to);
+ throw_on_failure_ = GTEST_FLAG_GET(throw_on_failure);
}
// The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS.
~GTestFlagSaver() {
- GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_;
- GTEST_FLAG(break_on_failure) = break_on_failure_;
- GTEST_FLAG(catch_exceptions) = catch_exceptions_;
- GTEST_FLAG(color) = color_;
- GTEST_FLAG(death_test_style) = death_test_style_;
- GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
- GTEST_FLAG(filter) = filter_;
- GTEST_FLAG(fail_fast) = fail_fast_;
- GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
- GTEST_FLAG(list_tests) = list_tests_;
- GTEST_FLAG(output) = output_;
- GTEST_FLAG(brief) = brief_;
- GTEST_FLAG(print_time) = print_time_;
- GTEST_FLAG(print_utf8) = print_utf8_;
- GTEST_FLAG(random_seed) = random_seed_;
- GTEST_FLAG(repeat) = repeat_;
- GTEST_FLAG(shuffle) = shuffle_;
- GTEST_FLAG(stack_trace_depth) = stack_trace_depth_;
- GTEST_FLAG(stream_result_to) = stream_result_to_;
- GTEST_FLAG(throw_on_failure) = throw_on_failure_;
+ GTEST_FLAG_SET(also_run_disabled_tests, also_run_disabled_tests_);
+ GTEST_FLAG_SET(break_on_failure, break_on_failure_);
+ GTEST_FLAG_SET(catch_exceptions, catch_exceptions_);
+ GTEST_FLAG_SET(color, color_);
+ GTEST_FLAG_SET(death_test_style, death_test_style_);
+ GTEST_FLAG_SET(death_test_use_fork, death_test_use_fork_);
+ GTEST_FLAG_SET(filter, filter_);
+ GTEST_FLAG_SET(fail_fast, fail_fast_);
+ GTEST_FLAG_SET(internal_run_death_test, internal_run_death_test_);
+ GTEST_FLAG_SET(list_tests, list_tests_);
+ GTEST_FLAG_SET(output, output_);
+ GTEST_FLAG_SET(brief, brief_);
+ GTEST_FLAG_SET(print_time, print_time_);
+ GTEST_FLAG_SET(print_utf8, print_utf8_);
+ GTEST_FLAG_SET(random_seed, random_seed_);
+ GTEST_FLAG_SET(repeat, repeat_);
+ GTEST_FLAG_SET(recreate_environments_when_repeating,
+ recreate_environments_when_repeating_);
+ GTEST_FLAG_SET(shuffle, shuffle_);
+ GTEST_FLAG_SET(stack_trace_depth, stack_trace_depth_);
+ GTEST_FLAG_SET(stream_result_to, stream_result_to_);
+ GTEST_FLAG_SET(throw_on_failure, throw_on_failure_);
}
private:
@@ -224,6 +207,7 @@ class GTestFlagSaver {
bool print_utf8_;
int32_t random_seed_;
int32_t repeat_;
+ bool recreate_environments_when_repeating_;
bool shuffle_;
int32_t stack_trace_depth_;
std::string stream_result_to_;
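
GTestFlagSaver above is a save-all/restore-all RAII guard: the constructor snapshots every flag with GTEST_FLAG_GET and the destructor writes them all back with GTEST_FLAG_SET. The same idea reduced to a single flag, as a sketch:

#include <cstdint>
#include "gtest/gtest.h"

class ScopedRepeatFlag {
 public:
  explicit ScopedRepeatFlag(int32_t value) : saved_(GTEST_FLAG_GET(repeat)) {
    GTEST_FLAG_SET(repeat, value);
  }
  ~ScopedRepeatFlag() { GTEST_FLAG_SET(repeat, saved_); }  // restore on exit

 private:
  int32_t saved_;
};
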
@@ -278,8 +262,8 @@ GTEST_API_ int32_t Int32FromEnvOrDie(const char* env_var, int32_t default_val);
// returns true if and only if the test should be run on this shard. The test id
// is some arbitrary but unique non-negative integer assigned to each test
// method. Assumes that 0 <= shard_index < total_shards.
-GTEST_API_ bool ShouldRunTestOnShard(
- int total_shards, int shard_index, int test_id);
+GTEST_API_ bool ShouldRunTestOnShard(int total_shards, int shard_index,
+ int test_id);
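
The contract in the comment above (each test id runs on exactly one of total_shards shards) is satisfied by a simple round-robin rule; if memory serves this is also what gtest itself does, but treat the body as illustrative:

bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
  // Every test id lands on exactly one shard in [0, total_shards).
  return (test_id % total_shards) == shard_index;
}
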
// STL container utilities.
@@ -290,9 +274,8 @@ inline int CountIf(const Container& c, Predicate predicate) {
// Implemented as an explicit loop since std::count_if() in libCstd on
// Solaris has a non-standard signature.
int count = 0;
- for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
- if (predicate(*it))
- ++count;
+ for (auto it = c.begin(); it != c.end(); ++it) {
+ if (predicate(*it)) ++count;
}
return count;
}
@@ -441,7 +424,9 @@ class OsStackTraceGetterInterface {
static const char* const kElidedFramesMarker;
private:
- GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface);
+ OsStackTraceGetterInterface(const OsStackTraceGetterInterface&) = delete;
+ OsStackTraceGetterInterface& operator=(const OsStackTraceGetterInterface&) =
+ delete;
};
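
This hunk is one of many in the diff replacing the GTEST_DISALLOW_COPY_AND_ASSIGN_ macro with explicitly deleted members, the idiomatic C++11 spelling. The transformation, shown on a hypothetical class:

class Widget {
 public:
  Widget() = default;
  // Before: GTEST_DISALLOW_COPY_AND_ASSIGN_(Widget);
  Widget(const Widget&) = delete;             // no copy construction
  Widget& operator=(const Widget&) = delete;  // no copy assignment
};
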
// A working implementation of the OsStackTraceGetterInterface interface.
@@ -463,7 +448,8 @@ class OsStackTraceGetter : public OsStackTraceGetterInterface {
void* caller_frame_ = nullptr;
#endif // GTEST_HAS_ABSL
- GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter);
+ OsStackTraceGetter(const OsStackTraceGetter&) = delete;
+ OsStackTraceGetter& operator=(const OsStackTraceGetter&) = delete;
};
// Information about a Google Test trace point.
@@ -476,7 +462,7 @@ struct TraceInfo {
// This is the default global test part result reporter used in UnitTestImpl.
// This class should only be used by UnitTestImpl.
class DefaultGlobalTestPartResultReporter
- : public TestPartResultReporterInterface {
+ : public TestPartResultReporterInterface {
public:
explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test);
// Implements the TestPartResultReporterInterface. Reports the test part
@@ -486,7 +472,10 @@ class DefaultGlobalTestPartResultReporter
private:
UnitTestImpl* const unit_test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter);
+ DefaultGlobalTestPartResultReporter(
+ const DefaultGlobalTestPartResultReporter&) = delete;
+ DefaultGlobalTestPartResultReporter& operator=(
+ const DefaultGlobalTestPartResultReporter&) = delete;
};
// This is the default per thread test part result reporter used in
@@ -502,7 +491,10 @@ class DefaultPerThreadTestPartResultReporter
private:
UnitTestImpl* const unit_test_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter);
+ DefaultPerThreadTestPartResultReporter(
+ const DefaultPerThreadTestPartResultReporter&) = delete;
+ DefaultPerThreadTestPartResultReporter& operator=(
+ const DefaultPerThreadTestPartResultReporter&) = delete;
};
// The private implementation of the UnitTest class. We don't protect
@@ -640,7 +632,8 @@ class GTEST_API_ UnitTestImpl {
// For example, if Foo() calls Bar(), which in turn calls
// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
- std::string CurrentOsStackTraceExceptTop(int skip_count) GTEST_NO_INLINE_;
+ std::string CurrentOsStackTraceExceptTop(int skip_count)
+ GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_;
// Finds and returns a TestSuite with the given name. If one doesn't
// exist, creates one and returns it.
@@ -744,9 +737,7 @@ class GTEST_API_ UnitTestImpl {
}
// Clears the results of ad-hoc test assertions.
- void ClearAdHocTestResult() {
- ad_hoc_test_result_.Clear();
- }
+ void ClearAdHocTestResult() { ad_hoc_test_result_.Clear(); }
// Adds a TestProperty to the current TestResult object when invoked in a
// context of a test or a test suite, or to the global property set. If the
@@ -754,10 +745,7 @@ class GTEST_API_ UnitTestImpl {
// updated.
void RecordProperty(const TestProperty& test_property);
- enum ReactionToSharding {
- HONOR_SHARDING_PROTOCOL,
- IGNORE_SHARDING_PROTOCOL
- };
+ enum ReactionToSharding { HONOR_SHARDING_PROTOCOL, IGNORE_SHARDING_PROTOCOL };
// Matches the full name of each test against the user-specified
// filter to decide whether the test should run, then records the
@@ -963,7 +951,8 @@ class GTEST_API_ UnitTestImpl {
// starts.
bool catch_exceptions_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+ UnitTestImpl(const UnitTestImpl&) = delete;
+ UnitTestImpl& operator=(const UnitTestImpl&) = delete;
}; // class UnitTestImpl
// Convenience function for accessing the global UnitTest
@@ -986,8 +975,9 @@ GTEST_API_ bool IsValidEscape(char ch);
GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
GTEST_API_ bool ValidateRegex(const char* regex);
GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
-GTEST_API_ bool MatchRepetitionAndRegexAtHead(
- bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(bool escaped, char ch,
+ char repeat, const char* regex,
+ const char* str);
GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
#endif // GTEST_USES_SIMPLE_RE
@@ -1089,8 +1079,7 @@ class StreamingListener : public EmptyTestEventListener {
}
~SocketWriter() override {
- if (sockfd_ != -1)
- CloseConnection();
+ if (sockfd_ != -1) CloseConnection();
}
// Sends a string to the socket.
@@ -1100,9 +1089,8 @@ class StreamingListener : public EmptyTestEventListener {
const auto len = static_cast<size_t>(message.length());
if (write(sockfd_, message.c_str(), len) != static_cast<ssize_t>(len)) {
- GTEST_LOG_(WARNING)
- << "stream_result_to: failed to stream to "
- << host_name_ << ":" << port_num_;
+ GTEST_LOG_(WARNING) << "stream_result_to: failed to stream to "
+ << host_name_ << ":" << port_num_;
}
}
@@ -1123,7 +1111,8 @@ class StreamingListener : public EmptyTestEventListener {
const std::string host_name_;
const std::string port_num_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(SocketWriter);
+ SocketWriter(const SocketWriter&) = delete;
+ SocketWriter& operator=(const SocketWriter&) = delete;
}; // class SocketWriter
// Escapes '=', '&', '%', and '\n' characters in str as "%xx".
@@ -1135,7 +1124,9 @@ class StreamingListener : public EmptyTestEventListener {
}
explicit StreamingListener(AbstractSocketWriter* socket_writer)
- : socket_writer_(socket_writer) { Start(); }
+ : socket_writer_(socket_writer) {
+ Start();
+ }
void OnTestProgramStart(const UnitTest& /* unit_test */) override {
SendLn("event=TestProgramStart");
@@ -1158,22 +1149,22 @@ class StreamingListener : public EmptyTestEventListener {
void OnTestIterationEnd(const UnitTest& unit_test,
int /* iteration */) override {
- SendLn("event=TestIterationEnd&passed=" +
- FormatBool(unit_test.Passed()) + "&elapsed_time=" +
- StreamableToString(unit_test.elapsed_time()) + "ms");
+ SendLn("event=TestIterationEnd&passed=" + FormatBool(unit_test.Passed()) +
+ "&elapsed_time=" + StreamableToString(unit_test.elapsed_time()) +
+ "ms");
}
// Note that "event=TestCaseStart" is a wire format and has to remain
// "case" for compatibility
- void OnTestCaseStart(const TestCase& test_case) override {
- SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
+ void OnTestSuiteStart(const TestSuite& test_suite) override {
+ SendLn(std::string("event=TestCaseStart&name=") + test_suite.name());
}
// Note that "event=TestCaseEnd" is a wire format and has to remain
// "case" for compatibility
- void OnTestCaseEnd(const TestCase& test_case) override {
- SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
- "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
+ void OnTestSuiteEnd(const TestSuite& test_suite) override {
+ SendLn("event=TestCaseEnd&passed=" + FormatBool(test_suite.Passed()) +
+ "&elapsed_time=" + StreamableToString(test_suite.elapsed_time()) +
"ms");
}
@@ -1183,8 +1174,7 @@ class StreamingListener : public EmptyTestEventListener {
void OnTestEnd(const TestInfo& test_info) override {
SendLn("event=TestEnd&passed=" +
- FormatBool((test_info.result())->Passed()) +
- "&elapsed_time=" +
+ FormatBool((test_info.result())->Passed()) + "&elapsed_time=" +
StreamableToString((test_info.result())->elapsed_time()) + "ms");
}
@@ -1208,7 +1198,8 @@ class StreamingListener : public EmptyTestEventListener {
const std::unique_ptr<AbstractSocketWriter> socket_writer_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener);
+ StreamingListener(const StreamingListener&) = delete;
+ StreamingListener& operator=(const StreamingListener&) = delete;
}; // class StreamingListener
#endif // GTEST_CAN_STREAM_RESULTS_
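
The listener above serializes events as key=value pairs joined by '&' (e.g. "event=TestEnd&passed=1&elapsed_time=12ms"), so the UrlEncode() it refers to must escape the delimiter characters. A minimal sketch consistent with the comment "Escapes '=', '&', '%', and '\n' characters in str as %xx":

#include <cstdio>
#include <string>

std::string UrlEncode(const std::string& str) {
  std::string out;
  for (const char ch : str) {
    if (ch == '=' || ch == '&' || ch == '%' || ch == '\n') {
      char buf[4];  // "%xx" plus terminator
      std::snprintf(buf, sizeof(buf), "%%%02x",
                    static_cast<unsigned>(static_cast<unsigned char>(ch)));
      out += buf;
    } else {
      out += ch;
    }
  }
  return out;
}
// UrlEncode("a=b") == "a%3db"
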
diff --git a/libvpx/third_party/googletest/src/src/gtest-matchers.cc b/libvpx/third_party/googletest/src/src/gtest-matchers.cc
index 65104ebab..7e3bcc0cf 100644
--- a/libvpx/third_party/googletest/src/src/gtest-matchers.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-matchers.cc
@@ -32,12 +32,13 @@
// This file implements just enough of the matcher interface to allow
// EXPECT_DEATH and friends to accept a matcher argument.
-#include "gtest/internal/gtest-internal.h"
-#include "gtest/internal/gtest-port.h"
#include "gtest/gtest-matchers.h"
#include <string>
+#include "gtest/internal/gtest-internal.h"
+#include "gtest/internal/gtest-port.h"
+
namespace testing {
// Constructs a matcher that matches a const std::string& whose value is
diff --git a/libvpx/third_party/googletest/src/src/gtest-port.cc b/libvpx/third_party/googletest/src/src/gtest-port.cc
index 53a4d37f9..d797fe4d5 100644
--- a/libvpx/third_party/googletest/src/src/gtest-port.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-port.cc
@@ -27,61 +27,62 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
#include "gtest/internal/gtest-port.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+
#include <cstdint>
#include <fstream>
#include <memory>
#if GTEST_OS_WINDOWS
-# include <windows.h>
-# include <io.h>
-# include <sys/stat.h>
-# include <map> // Used in ThreadLocal.
-# ifdef _MSC_VER
-# include <crtdbg.h>
-# endif // _MSC_VER
+#include <io.h>
+#include <sys/stat.h>
+#include <windows.h>
+
+#include <map> // Used in ThreadLocal.
+#ifdef _MSC_VER
+#include <crtdbg.h>
+#endif // _MSC_VER
#else
-# include <unistd.h>
+#include <unistd.h>
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_MAC
-# include <mach/mach_init.h>
-# include <mach/task.h>
-# include <mach/vm_map.h>
+#include <mach/mach_init.h>
+#include <mach/task.h>
+#include <mach/vm_map.h>
#endif // GTEST_OS_MAC
#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
GTEST_OS_NETBSD || GTEST_OS_OPENBSD
-# include <sys/sysctl.h>
-# if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
-# include <sys/user.h>
-# endif
+#include <sys/sysctl.h>
+#if GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD
+#include <sys/user.h>
+#endif
#endif
#if GTEST_OS_QNX
-# include <devctl.h>
-# include <fcntl.h>
-# include <sys/procfs.h>
+#include <devctl.h>
+#include <fcntl.h>
+#include <sys/procfs.h>
#endif // GTEST_OS_QNX
#if GTEST_OS_AIX
-# include <procinfo.h>
-# include <sys/types.h>
+#include <procinfo.h>
+#include <sys/types.h>
#endif // GTEST_OS_AIX
#if GTEST_OS_FUCHSIA
-# include <zircon/process.h>
-# include <zircon/syscalls.h>
+#include <zircon/process.h>
+#include <zircon/syscalls.h>
#endif // GTEST_OS_FUCHSIA
-#include "gtest/gtest-spi.h"
#include "gtest/gtest-message.h"
+#include "gtest/gtest-spi.h"
#include "gtest/internal/gtest-internal.h"
#include "gtest/internal/gtest-string.h"
#include "src/gtest-internal-inl.h"
@@ -89,16 +90,7 @@
namespace testing {
namespace internal {
-#if defined(_MSC_VER) || defined(__BORLANDC__)
-// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
-const int kStdOutFileno = 1;
-const int kStdErrFileno = 2;
-#else
-const int kStdOutFileno = STDOUT_FILENO;
-const int kStdErrFileno = STDERR_FILENO;
-#endif // _MSC_VER
-
-#if GTEST_OS_LINUX
+#if GTEST_OS_LINUX || GTEST_OS_GNU_HURD
namespace {
template <typename T>
@@ -131,8 +123,7 @@ size_t GetThreadCount() {
if (status == KERN_SUCCESS) {
// task_threads allocates resources in thread_list and we need to free them
// to avoid leaks.
- vm_deallocate(task,
- reinterpret_cast<vm_address_t>(thread_list),
+ vm_deallocate(task, reinterpret_cast<vm_address_t>(thread_list),
sizeof(thread_t) * thread_count);
return static_cast<size_t>(thread_count);
} else {
@@ -141,7 +132,7 @@ size_t GetThreadCount() {
}
#elif GTEST_OS_DRAGONFLY || GTEST_OS_FREEBSD || GTEST_OS_GNU_KFREEBSD || \
- GTEST_OS_NETBSD
+ GTEST_OS_NETBSD
#if GTEST_OS_NETBSD
#undef KERN_PROC
@@ -184,12 +175,12 @@ size_t GetThreadCount() {
// we cannot detect it.
size_t GetThreadCount() {
int mib[] = {
- CTL_KERN,
- KERN_PROC,
- KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
- getpid(),
- sizeof(struct kinfo_proc),
- 0,
+ CTL_KERN,
+ KERN_PROC,
+ KERN_PROC_PID | KERN_PROC_SHOW_THREADS,
+ getpid(),
+ sizeof(struct kinfo_proc),
+ 0,
};
u_int miblen = sizeof(mib) / sizeof(mib[0]);
@@ -210,8 +201,7 @@ size_t GetThreadCount() {
// exclude empty members
size_t nthreads = 0;
for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
- if (info[i].p_tid != -1)
- nthreads++;
+ if (info[i].p_tid != -1) nthreads++;
}
return nthreads;
}
@@ -254,13 +244,9 @@ size_t GetThreadCount() {
size_t GetThreadCount() {
int dummy_buffer;
size_t avail;
- zx_status_t status = zx_object_get_info(
- zx_process_self(),
- ZX_INFO_PROCESS_THREADS,
- &dummy_buffer,
- 0,
- nullptr,
- &avail);
+ zx_status_t status =
+ zx_object_get_info(zx_process_self(), ZX_INFO_PROCESS_THREADS,
+ &dummy_buffer, 0, nullptr, &avail);
if (status == ZX_OK) {
return avail;
} else {
@@ -280,27 +266,15 @@ size_t GetThreadCount() {
#if GTEST_IS_THREADSAFE && GTEST_OS_WINDOWS
-void SleepMilliseconds(int n) {
- ::Sleep(static_cast<DWORD>(n));
-}
+AutoHandle::AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
-AutoHandle::AutoHandle()
- : handle_(INVALID_HANDLE_VALUE) {}
+AutoHandle::AutoHandle(Handle handle) : handle_(handle) {}
-AutoHandle::AutoHandle(Handle handle)
- : handle_(handle) {}
+AutoHandle::~AutoHandle() { Reset(); }
-AutoHandle::~AutoHandle() {
- Reset();
-}
-
-AutoHandle::Handle AutoHandle::Get() const {
- return handle_;
-}
+AutoHandle::Handle AutoHandle::Get() const { return handle_; }
-void AutoHandle::Reset() {
- Reset(INVALID_HANDLE_VALUE);
-}
+void AutoHandle::Reset() { Reset(INVALID_HANDLE_VALUE); }
void AutoHandle::Reset(HANDLE handle) {
// Resetting with the same handle we already own is invalid.
@@ -312,7 +286,7 @@ void AutoHandle::Reset(HANDLE handle) {
} else {
GTEST_CHECK_(!IsCloseable())
<< "Resetting a valid handle to itself is likely a programmer error "
- "and thus not allowed.";
+ "and thus not allowed.";
}
}
@@ -322,23 +296,6 @@ bool AutoHandle::IsCloseable() const {
return handle_ != nullptr && handle_ != INVALID_HANDLE_VALUE;
}
-Notification::Notification()
- : event_(::CreateEvent(nullptr, // Default security attributes.
- TRUE, // Do not reset automatically.
- FALSE, // Initially unset.
- nullptr)) { // Anonymous event.
- GTEST_CHECK_(event_.Get() != nullptr);
-}
-
-void Notification::Notify() {
- GTEST_CHECK_(::SetEvent(event_.Get()) != FALSE);
-}
-
-void Notification::WaitForNotification() {
- GTEST_CHECK_(
- ::WaitForSingleObject(event_.Get(), INFINITE) == WAIT_OBJECT_0);
-}
-
Mutex::Mutex()
: owner_thread_id_(0),
type_(kDynamic),
@@ -391,25 +348,25 @@ namespace {
// MemoryIsNotDeallocated memory_is_not_deallocated;
// critical_section_ = new CRITICAL_SECTION;
//
-class MemoryIsNotDeallocated
-{
+class MemoryIsNotDeallocated {
public:
MemoryIsNotDeallocated() : old_crtdbg_flag_(0) {
old_crtdbg_flag_ = _CrtSetDbgFlag(_CRTDBG_REPORT_FLAG);
// Set heap allocation block type to _IGNORE_BLOCK so that MS debug CRT
// doesn't report mem leak if there's no matching deallocation.
- _CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
+ (void)_CrtSetDbgFlag(old_crtdbg_flag_ & ~_CRTDBG_ALLOC_MEM_DF);
}
~MemoryIsNotDeallocated() {
// Restore the original _CRTDBG_ALLOC_MEM_DF flag
- _CrtSetDbgFlag(old_crtdbg_flag_);
+ (void)_CrtSetDbgFlag(old_crtdbg_flag_);
}
private:
int old_crtdbg_flag_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(MemoryIsNotDeallocated);
+ MemoryIsNotDeallocated(const MemoryIsNotDeallocated&) = delete;
+ MemoryIsNotDeallocated& operator=(const MemoryIsNotDeallocated&) = delete;
};
#endif // _MSC_VER
@@ -435,15 +392,13 @@ void Mutex::ThreadSafeLazyInit() {
::InitializeCriticalSection(critical_section_);
// Updates the critical_section_init_phase_ to 2 to signal
// initialization complete.
- GTEST_CHECK_(::InterlockedCompareExchange(
- &critical_section_init_phase_, 2L, 1L) ==
- 1L);
+ GTEST_CHECK_(::InterlockedCompareExchange(&critical_section_init_phase_,
+ 2L, 1L) == 1L);
break;
case 1:
// Somebody else is already initializing the mutex; spin until they
// are done.
- while (::InterlockedCompareExchange(&critical_section_init_phase_,
- 2L,
+ while (::InterlockedCompareExchange(&critical_section_init_phase_, 2L,
2L) != 2L) {
// Possibly yields the rest of the thread's time slice to other
// threads.
@@ -488,9 +443,7 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
private:
struct ThreadMainParam {
ThreadMainParam(Runnable* runnable, Notification* thread_can_start)
- : runnable_(runnable),
- thread_can_start_(thread_can_start) {
- }
+ : runnable_(runnable), thread_can_start_(thread_can_start) {}
std::unique_ptr<Runnable> runnable_;
// Does not own.
Notification* thread_can_start_;
@@ -508,20 +461,18 @@ class ThreadWithParamSupport : public ThreadWithParamBase {
// Prohibit instantiation.
ThreadWithParamSupport();
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParamSupport);
+ ThreadWithParamSupport(const ThreadWithParamSupport&) = delete;
+ ThreadWithParamSupport& operator=(const ThreadWithParamSupport&) = delete;
};
} // namespace
-ThreadWithParamBase::ThreadWithParamBase(Runnable *runnable,
+ThreadWithParamBase::ThreadWithParamBase(Runnable* runnable,
Notification* thread_can_start)
- : thread_(ThreadWithParamSupport::CreateThread(runnable,
- thread_can_start)) {
-}
+ : thread_(
+ ThreadWithParamSupport::CreateThread(runnable, thread_can_start)) {}
-ThreadWithParamBase::~ThreadWithParamBase() {
- Join();
-}
+ThreadWithParamBase::~ThreadWithParamBase() { Join(); }
void ThreadWithParamBase::Join() {
GTEST_CHECK_(::WaitForSingleObject(thread_.Get(), INFINITE) == WAIT_OBJECT_0)
@@ -548,8 +499,10 @@ class ThreadLocalRegistryImpl {
ThreadIdToThreadLocals::iterator thread_local_pos =
thread_to_thread_locals->find(current_thread);
if (thread_local_pos == thread_to_thread_locals->end()) {
- thread_local_pos = thread_to_thread_locals->insert(
- std::make_pair(current_thread, ThreadLocalValues())).first;
+ thread_local_pos =
+ thread_to_thread_locals
+ ->insert(std::make_pair(current_thread, ThreadLocalValues()))
+ .first;
StartWatcherThreadFor(current_thread);
}
ThreadLocalValues& thread_local_values = thread_local_pos->second;
@@ -577,9 +530,8 @@ class ThreadLocalRegistryImpl {
ThreadIdToThreadLocals* const thread_to_thread_locals =
GetThreadLocalsMapLocked();
for (ThreadIdToThreadLocals::iterator it =
- thread_to_thread_locals->begin();
- it != thread_to_thread_locals->end();
- ++it) {
+ thread_to_thread_locals->begin();
+ it != thread_to_thread_locals->end(); ++it) {
ThreadLocalValues& thread_local_values = it->second;
ThreadLocalValues::iterator value_pos =
thread_local_values.find(thread_local_instance);
@@ -609,9 +561,8 @@ class ThreadLocalRegistryImpl {
if (thread_local_pos != thread_to_thread_locals->end()) {
ThreadLocalValues& thread_local_values = thread_local_pos->second;
for (ThreadLocalValues::iterator value_pos =
- thread_local_values.begin();
- value_pos != thread_local_values.end();
- ++value_pos) {
+ thread_local_values.begin();
+ value_pos != thread_local_values.end(); ++value_pos) {
value_holders.push_back(value_pos->second);
}
thread_to_thread_locals->erase(thread_local_pos);
@@ -637,9 +588,8 @@ class ThreadLocalRegistryImpl {
static void StartWatcherThreadFor(DWORD thread_id) {
// The returned handle will be kept in thread_map and closed by
// watcher_thread in WatcherThreadFunc.
- HANDLE thread = ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION,
- FALSE,
- thread_id);
+ HANDLE thread =
+ ::OpenThread(SYNCHRONIZE | THREAD_QUERY_INFORMATION, FALSE, thread_id);
GTEST_CHECK_(thread != nullptr);
// We need to pass a valid thread ID pointer into CreateThread for it
// to work correctly under Win98.
@@ -650,7 +600,8 @@ class ThreadLocalRegistryImpl {
&ThreadLocalRegistryImpl::WatcherThreadFunc,
reinterpret_cast<LPVOID>(new ThreadIdAndHandle(thread_id, thread)),
CREATE_SUSPENDED, &watcher_thread_id);
- GTEST_CHECK_(watcher_thread != nullptr);
+ GTEST_CHECK_(watcher_thread != nullptr)
+ << "CreateThread failed with error " << ::GetLastError() << ".";
// Give the watcher thread the same priority as ours to avoid being
// blocked by it.
::SetThreadPriority(watcher_thread,
@@ -664,8 +615,7 @@ class ThreadLocalRegistryImpl {
static DWORD WINAPI WatcherThreadFunc(LPVOID param) {
const ThreadIdAndHandle* tah =
reinterpret_cast<const ThreadIdAndHandle*>(param);
- GTEST_CHECK_(
- ::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
+ GTEST_CHECK_(::WaitForSingleObject(tah->second, INFINITE) == WAIT_OBJECT_0);
OnThreadExit(tah->first);
::CloseHandle(tah->second);
delete tah;
@@ -689,16 +639,17 @@ class ThreadLocalRegistryImpl {
};
Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex); // NOLINT
-Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex); // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(
+ Mutex::kStaticMutex); // NOLINT
ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
- const ThreadLocalBase* thread_local_instance) {
+ const ThreadLocalBase* thread_local_instance) {
return ThreadLocalRegistryImpl::GetValueOnCurrentThread(
thread_local_instance);
}
void ThreadLocalRegistry::OnThreadLocalDestroyed(
- const ThreadLocalBase* thread_local_instance) {
+ const ThreadLocalBase* thread_local_instance) {
ThreadLocalRegistryImpl::OnThreadLocalDestroyed(thread_local_instance);
}
@@ -786,7 +737,7 @@ bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); }
bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); }
bool IsAsciiWordChar(char ch) {
return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') ||
- ('0' <= ch && ch <= '9') || ch == '_';
+ ('0' <= ch && ch <= '9') || ch == '_';
}
// Returns true if and only if "\\c" is a supported escape sequence.
@@ -799,17 +750,28 @@ bool IsValidEscape(char c) {
bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
if (escaped) { // "\\p" where p is pattern_char.
switch (pattern_char) {
- case 'd': return IsAsciiDigit(ch);
- case 'D': return !IsAsciiDigit(ch);
- case 'f': return ch == '\f';
- case 'n': return ch == '\n';
- case 'r': return ch == '\r';
- case 's': return IsAsciiWhiteSpace(ch);
- case 'S': return !IsAsciiWhiteSpace(ch);
- case 't': return ch == '\t';
- case 'v': return ch == '\v';
- case 'w': return IsAsciiWordChar(ch);
- case 'W': return !IsAsciiWordChar(ch);
+ case 'd':
+ return IsAsciiDigit(ch);
+ case 'D':
+ return !IsAsciiDigit(ch);
+ case 'f':
+ return ch == '\f';
+ case 'n':
+ return ch == '\n';
+ case 'r':
+ return ch == '\r';
+ case 's':
+ return IsAsciiWhiteSpace(ch);
+ case 'S':
+ return !IsAsciiWhiteSpace(ch);
+ case 't':
+ return ch == '\t';
+ case 'v':
+ return ch == '\v';
+ case 'w':
+ return IsAsciiWordChar(ch);
+ case 'W':
+ return !IsAsciiWordChar(ch);
}
return IsAsciiPunct(pattern_char) && pattern_char == ch;
}
@@ -820,7 +782,8 @@ bool AtomMatchesChar(bool escaped, char pattern_char, char ch) {
// Helper function used by ValidateRegex() to format error messages.
static std::string FormatRegexSyntaxError(const char* regex, int index) {
return (Message() << "Syntax error at index " << index
- << " in simple regular expression \"" << regex << "\": ").GetString();
+ << " in simple regular expression \"" << regex << "\": ")
+ .GetString();
}
// Generates non-fatal failures and returns false if regex is invalid;
@@ -862,12 +825,12 @@ bool ValidateRegex(const char* regex) {
<< "'$' can only appear at the end.";
is_valid = false;
} else if (IsInSet(ch, "()[]{}|")) {
- ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
- << "'" << ch << "' is unsupported.";
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+ << "' is unsupported.";
is_valid = false;
} else if (IsRepeat(ch) && !prev_repeatable) {
- ADD_FAILURE() << FormatRegexSyntaxError(regex, i)
- << "'" << ch << "' can only follow a repeatable token.";
+ ADD_FAILURE() << FormatRegexSyntaxError(regex, i) << "'" << ch
+ << "' can only follow a repeatable token.";
is_valid = false;
}
@@ -885,12 +848,10 @@ bool ValidateRegex(const char* regex) {
// characters to be indexable by size_t, in which case the test will
// probably time out anyway. We are fine with this limitation as
// std::string has it too.
-bool MatchRepetitionAndRegexAtHead(
- bool escaped, char c, char repeat, const char* regex,
- const char* str) {
+bool MatchRepetitionAndRegexAtHead(bool escaped, char c, char repeat,
+ const char* regex, const char* str) {
const size_t min_count = (repeat == '+') ? 1 : 0;
- const size_t max_count = (repeat == '?') ? 1 :
- static_cast<size_t>(-1) - 1;
+ const size_t max_count = (repeat == '?') ? 1 : static_cast<size_t>(-1) - 1;
// We cannot call numeric_limits::max() as it conflicts with the
// max() macro on Windows.
@@ -903,8 +864,7 @@ bool MatchRepetitionAndRegexAtHead(
// greedy match.
return true;
}
- if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i]))
- return false;
+ if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) return false;
}
return false;
}
@@ -918,25 +878,23 @@ bool MatchRegexAtHead(const char* regex, const char* str) {
// "$" only matches the end of a string. Note that regex being
// valid guarantees that there's nothing after "$" in it.
- if (*regex == '$')
- return *str == '\0';
+ if (*regex == '$') return *str == '\0';
// Is the first thing in regex an escape sequence?
const bool escaped = *regex == '\\';
- if (escaped)
- ++regex;
+ if (escaped) ++regex;
if (IsRepeat(regex[1])) {
// MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so
// here's an indirect recursion. It terminates as the regex gets
// shorter in each recursion.
- return MatchRepetitionAndRegexAtHead(
- escaped, regex[0], regex[1], regex + 2, str);
+ return MatchRepetitionAndRegexAtHead(escaped, regex[0], regex[1], regex + 2,
+ str);
} else {
// regex isn't empty, isn't "$", and doesn't start with a
// repetition. We match the first atom of regex with the first
// character of str and recurse.
return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) &&
- MatchRegexAtHead(regex + 1, str + 1);
+ MatchRegexAtHead(regex + 1, str + 1);
}
}
@@ -951,13 +909,11 @@ bool MatchRegexAtHead(const char* regex, const char* str) {
bool MatchRegexAnywhere(const char* regex, const char* str) {
if (regex == nullptr || str == nullptr) return false;
- if (*regex == '^')
- return MatchRegexAtHead(regex + 1, str);
+ if (*regex == '^') return MatchRegexAtHead(regex + 1, str);
// A successful match can be anywhere in str.
do {
- if (MatchRegexAtHead(regex, str))
- return true;
+ if (MatchRegexAtHead(regex, str)) return true;
} while (*str++ != '\0');
return false;
}
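
Taken together, the routines reflowed above implement gtest's "simple" regex dialect: '^', '$', '.', escape classes like \d and \w, and the '?', '*', '+' repeats, with MatchRegexAnywhere() as the entry point (declared in gtest-internal-inl.h earlier in this diff). A usage sketch, assuming a build where GTEST_USES_SIMPLE_RE is in effect and the internal header is reachable:

#include <cassert>
#include "src/gtest-internal-inl.h"  // internal header, as included in this file

int main() {
  using ::testing::internal::MatchRegexAnywhere;
  assert(MatchRegexAnywhere("^\\d+$", "12345"));  // anchored run of digits
  assert(MatchRegexAnywhere("ab?c", "ac"));       // '?': zero or one 'b'
  assert(!MatchRegexAnywhere("^abc", "zabc"));    // '^' pins the match start
  return 0;
}
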
@@ -1038,8 +994,8 @@ GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) {
// FormatFileLocation in order to contrast the two functions.
// Note that FormatCompilerIndependentFileLocation() does NOT append colon
// to the file location it produces, unlike FormatFileLocation().
-GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
- const char* file, int line) {
+GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file,
+ int line) {
const std::string file_name(file == nullptr ? kUnknownFile : file);
if (line < 0)
@@ -1050,12 +1006,13 @@ GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(
GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line)
: severity_(severity) {
- const char* const marker =
- severity == GTEST_INFO ? "[ INFO ]" :
- severity == GTEST_WARNING ? "[WARNING]" :
- severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]";
- GetStream() << ::std::endl << marker << " "
- << FormatFileLocation(file, line).c_str() << ": ";
+ const char* const marker = severity == GTEST_INFO ? "[ INFO ]"
+ : severity == GTEST_WARNING ? "[WARNING]"
+ : severity == GTEST_ERROR ? "[ ERROR ]"
+ : "[ FATAL ]";
+ GetStream() << ::std::endl
+ << marker << " " << FormatFileLocation(file, line).c_str()
+ << ": ";
}
// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program.
@@ -1078,27 +1035,26 @@ class CapturedStream {
public:
// The ctor redirects the stream to a temporary file.
explicit CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) {
-# if GTEST_OS_WINDOWS
- char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT
- char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT
+#if GTEST_OS_WINDOWS
+ char temp_dir_path[MAX_PATH + 1] = {'\0'}; // NOLINT
+ char temp_file_path[MAX_PATH + 1] = {'\0'}; // NOLINT
::GetTempPathA(sizeof(temp_dir_path), temp_dir_path);
- const UINT success = ::GetTempFileNameA(temp_dir_path,
- "gtest_redir",
+ const UINT success = ::GetTempFileNameA(temp_dir_path, "gtest_redir",
0, // Generate unique file name.
temp_file_path);
GTEST_CHECK_(success != 0)
<< "Unable to create a temporary file in " << temp_dir_path;
const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE);
- GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file "
- << temp_file_path;
+ GTEST_CHECK_(captured_fd != -1)
+ << "Unable to open temporary file " << temp_file_path;
filename_ = temp_file_path;
-# else
+#else
// There's no guarantee that a test has write access to the current
// directory, so we create the temporary file in a temporary directory.
std::string name_template;
-# if GTEST_OS_LINUX_ANDROID
+#if GTEST_OS_LINUX_ANDROID
// Note: Android applications are expected to call the framework's
// Context.getExternalStorageDirectory() method through JNI to get
// the location of the world-writable SD Card directory. However,
@@ -1111,7 +1067,7 @@ class CapturedStream {
// '/sdcard' and other variants cannot be relied on, as they are not
// guaranteed to be mounted, or may have a delay in mounting.
name_template = "/data/local/tmp/";
-# elif GTEST_OS_IOS
+#elif GTEST_OS_IOS
char user_temp_dir[PATH_MAX + 1];
// Documented alternative to NSTemporaryDirectory() (for obtaining/creating
@@ -1132,9 +1088,9 @@ class CapturedStream {
name_template = user_temp_dir;
if (name_template.back() != GTEST_PATH_SEP_[0])
name_template.push_back(GTEST_PATH_SEP_[0]);
-# else
+#else
name_template = "/tmp/";
-# endif
+#endif
name_template.append("gtest_captured_stream.XXXXXX");
// mkstemp() modifies the string bytes in place, and does not go beyond the
@@ -1150,15 +1106,13 @@ class CapturedStream {
<< " for test; does the test have access to the /tmp directory?";
}
filename_ = std::move(name_template);
-# endif // GTEST_OS_WINDOWS
+#endif // GTEST_OS_WINDOWS
fflush(nullptr);
dup2(captured_fd, fd_);
close(captured_fd);
}
- ~CapturedStream() {
- remove(filename_.c_str());
- }
+ ~CapturedStream() { remove(filename_.c_str()); }
std::string GetCapturedString() {
if (uncaptured_fd_ != -1) {
@@ -1185,7 +1139,8 @@ class CapturedStream {
// Name of the temporary file holding the stderr output.
::std::string filename_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream);
+ CapturedStream(const CapturedStream&) = delete;
+ CapturedStream& operator=(const CapturedStream&) = delete;
};
GTEST_DISABLE_MSC_DEPRECATED_POP_()
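
CapturedStream implements stream capture with file descriptors: it saves a duplicate of the target descriptor, points the descriptor at a temporary file via dup2(), and later restores it. A minimal POSIX-only sketch of the same idea, independent of the class above (error handling elided):

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main() {
      int saved = dup(STDOUT_FILENO);    // remember where stdout pointed
      char path[] = "/tmp/capture.XXXXXX";
      int tmp = mkstemp(path);           // temporary file backing the capture
      fflush(stdout);
      dup2(tmp, STDOUT_FILENO);          // stdout now writes into the file
      printf("captured\n");
      fflush(stdout);
      dup2(saved, STDOUT_FILENO);        // restore the real stdout
      close(tmp);
      close(saved);
      remove(path);
      return 0;
    }
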
@@ -1213,6 +1168,15 @@ static std::string GetCapturedStream(CapturedStream** captured_stream) {
return content;
}
+#if defined(_MSC_VER) || defined(__BORLANDC__)
+// MSVC and C++Builder do not provide a definition of STDERR_FILENO.
+const int kStdOutFileno = 1;
+const int kStdErrFileno = 2;
+#else
+const int kStdOutFileno = STDOUT_FILENO;
+const int kStdErrFileno = STDERR_FILENO;
+#endif // defined(_MSC_VER) || defined(__BORLANDC__)
+
// Starts capturing stdout.
void CaptureStdout() {
CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout);
@@ -1235,10 +1199,6 @@ std::string GetCapturedStderr() {
#endif // GTEST_HAS_STREAM_REDIRECTION
-
-
-
-
size_t GetFileSize(FILE* file) {
fseek(file, 0, SEEK_END);
return static_cast<size_t>(ftell(file));
@@ -1256,7 +1216,8 @@ std::string ReadEntireFile(FILE* file) {
// Keeps reading the file until we cannot read further or the
// pre-determined file size is reached.
do {
- bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file);
+ bytes_last_read =
+ fread(buffer + bytes_read, 1, file_size - bytes_read, file);
bytes_read += bytes_last_read;
} while (bytes_last_read > 0 && bytes_read < file_size);
@@ -1344,7 +1305,7 @@ bool ParseInt32(const Message& src_text, const char* str, int32_t* value) {
// LONG_MAX or LONG_MIN when the input overflows.)
result != long_value
// The parsed value overflows as an int32_t.
- ) {
+ ) {
Message msg;
msg << "WARNING: " << src_text
<< " is expected to be a 32-bit integer, but actually"
@@ -1388,8 +1349,8 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
}
int32_t result = default_value;
- if (!ParseInt32(Message() << "Environment variable " << env_var,
- string_value, &result)) {
+ if (!ParseInt32(Message() << "Environment variable " << env_var, string_value,
+ &result)) {
printf("The default value %s is used.\n",
(Message() << default_value).GetString().c_str());
fflush(stdout);
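
ParseInt32() rejects overflow without relying on errno: it parses with strtol(), requires that the whole string was consumed, and then checks that the long value round-trips through int32_t unchanged. A condensed standalone sketch of that check (an approximation, not the exact function above):

    #include <cstdint>
    #include <cstdlib>

    bool ParseInt32Sketch(const char* str, int32_t* value) {
      char* end = nullptr;
      const long long_value = strtol(str, &end, 10);
      if (end == str || *end != '\0') return false;  // empty or trailing junk
      const int32_t result = static_cast<int32_t>(long_value);
      if (static_cast<long>(result) != long_value) return false;  // overflow
      *value = result;
      return true;
    }
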
@@ -1408,7 +1369,7 @@ int32_t Int32FromGTestEnv(const char* flag, int32_t default_value) {
// not check that the flag is 'output'
// In essence this checks an env variable called XML_OUTPUT_FILE
// and if it is set we prepend "xml:" to its value; if it is not set we return ""
-std::string OutputFlagAlsoCheckEnvVar(){
+std::string OutputFlagAlsoCheckEnvVar() {
std::string default_value_for_output_flag = "";
const char* xml_output_file_env = posix::GetEnv("XML_OUTPUT_FILE");
if (nullptr != xml_output_file_env) {
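
The function reads the XML_OUTPUT_FILE environment variable (set by build systems such as Bazel) and, when present, turns it into an implicit --gtest_output default. A sketch of that fallback (the function name here is hypothetical):

    #include <cstdlib>
    #include <string>

    std::string OutputFlagDefaultSketch() {
      const char* path = std::getenv("XML_OUTPUT_FILE");
      return path != nullptr ? std::string("xml:") + path : std::string();
    }
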
diff --git a/libvpx/third_party/googletest/src/src/gtest-printers.cc b/libvpx/third_party/googletest/src/src/gtest-printers.cc
index 1b68fcb50..f3976d230 100644
--- a/libvpx/third_party/googletest/src/src/gtest-printers.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-printers.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
// Google Test - The Google C++ Testing and Mocking Framework
//
// This file implements a universal value printer that can print a
@@ -101,7 +100,7 @@ void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count,
PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os);
*os << " ... ";
// Rounds up to 2-byte boundary.
- const size_t resume_pos = (count - kChunkSize + 1)/2*2;
+ const size_t resume_pos = (count - kChunkSize + 1) / 2 * 2;
PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os);
}
*os << ">";
@@ -136,11 +135,7 @@ void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count,
// - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
// - as a hexadecimal escape sequence (e.g. '\x7F'), or
// - as a special escape sequence (e.g. '\r', '\n').
-enum CharFormat {
- kAsIs,
- kHexEscape,
- kSpecialEscape
-};
+enum CharFormat { kAsIs, kHexEscape, kSpecialEscape };
// Returns true if c is a printable ASCII character. We test the
// value of c directly instead of calling isprint(), which is buggy on
@@ -213,35 +208,21 @@ static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
}
}
-static const char* GetCharWidthPrefix(char) {
- return "";
-}
+static const char* GetCharWidthPrefix(char) { return ""; }
-static const char* GetCharWidthPrefix(signed char) {
- return "";
-}
+static const char* GetCharWidthPrefix(signed char) { return ""; }
-static const char* GetCharWidthPrefix(unsigned char) {
- return "";
-}
+static const char* GetCharWidthPrefix(unsigned char) { return ""; }
#ifdef __cpp_char8_t
-static const char* GetCharWidthPrefix(char8_t) {
- return "u8";
-}
+static const char* GetCharWidthPrefix(char8_t) { return "u8"; }
#endif
-static const char* GetCharWidthPrefix(char16_t) {
- return "u";
-}
+static const char* GetCharWidthPrefix(char16_t) { return "u"; }
-static const char* GetCharWidthPrefix(char32_t) {
- return "U";
-}
+static const char* GetCharWidthPrefix(char32_t) { return "U"; }
-static const char* GetCharWidthPrefix(wchar_t) {
- return "L";
-}
+static const char* GetCharWidthPrefix(wchar_t) { return "L"; }
// Prints a char c as if it's part of a string literal, escaping it when
// necessary; returns how c was formatted.
@@ -276,8 +257,7 @@ void PrintCharAndCodeTo(Char c, ostream* os) {
// To aid user debugging, we also print c's code in decimal, unless
// it's 0 (in which case c was printed as '\\0', making the code
// obvious).
- if (c == 0)
- return;
+ if (c == 0) return;
*os << " (" << static_cast<int>(c);
// For more convenience, we print c's code again in hexadecimal,
@@ -304,17 +284,60 @@ void PrintTo(char32_t c, ::std::ostream* os) {
<< static_cast<uint32_t>(c);
}
+// gcc/clang __{u,}int128_t
+#if defined(__SIZEOF_INT128__)
+void PrintTo(__uint128_t v, ::std::ostream* os) {
+ if (v == 0) {
+ *os << "0";
+ return;
+ }
+
+ // Buffer large enough for ceil(log10(2^128))==39 and the null terminator
+ char buf[40];
+ char* p = buf + sizeof(buf);
+
+ // Some configurations have a __uint128_t, but no support for built-in
+ // division. Do manual long division instead.
+
+ uint64_t high = static_cast<uint64_t>(v >> 64);
+ uint64_t low = static_cast<uint64_t>(v);
+
+ *--p = 0;
+ while (high != 0 || low != 0) {
+ uint64_t high_mod = high % 10;
+ high = high / 10;
+ // This is the long division algorithm specialized for a divisor of 10 and
+ // only two elements.
+ // Notable values:
+ // 2^64 / 10 == 1844674407370955161
+ // 2^64 % 10 == 6
+ const uint64_t carry = 6 * high_mod + low % 10;
+ low = low / 10 + high_mod * 1844674407370955161 + carry / 10;
+
+ char digit = static_cast<char>(carry % 10);
+ *--p = '0' + digit;
+ }
+ *os << p;
+}
+void PrintTo(__int128_t v, ::std::ostream* os) {
+ __uint128_t uv = static_cast<__uint128_t>(v);
+ if (v < 0) {
+ *os << "-";
+ uv = -uv;
+ }
+ PrintTo(uv, os);
+}
+#endif // __SIZEOF_INT128__
+
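The digit loop above divides a 128-bit value by 10 using only 64-bit arithmetic: writing v = high * 2^64 + low and using the identities 2^64 / 10 == 1844674407370955161 and 2^64 % 10 == 6, each step emits the decimal digit (6 * (high % 10) + low % 10) % 10. A self-checking sketch of a single step, using v = 2^64 (so high = 1, low = 0) as the worked example:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t high = 1, low = 0;  // v = 2^64 = 18446744073709551616
      const uint64_t high_mod = high % 10;
      high /= 10;
      const uint64_t carry = 6 * high_mod + low % 10;
      low = low / 10 + high_mod * 1844674407370955161ULL + carry / 10;
      assert(carry % 10 == 6);  // 2^64 ends in the digit 6
      assert(high == 0 && low == 1844674407370955161ULL);  // v / 10
      return 0;
    }
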
// Prints the given array of characters to the ostream. CharType must be either
// char, char8_t, char16_t, char32_t, or wchar_t.
// The array starts at begin, the length is len, it may include '\0' characters
// and may not be NUL-terminated.
template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static CharFormat PrintCharsAsStringTo(
- const CharType* begin, size_t len, ostream* os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static CharFormat
+ PrintCharsAsStringTo(const CharType* begin, size_t len, ostream* os) {
const char* const quote_prefix = GetCharWidthPrefix(*begin);
*os << quote_prefix << "\"";
bool is_previous_hex = false;
@@ -340,12 +363,11 @@ static CharFormat PrintCharsAsStringTo(
// Prints a (const) char/wchar_t array of 'len' elements, starting at address
// 'begin'. CharType must be either char or wchar_t.
template <typename CharType>
-GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_
-GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
-GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
-static void UniversalPrintCharArray(
- const CharType* begin, size_t len, ostream* os) {
+GTEST_ATTRIBUTE_NO_SANITIZE_MEMORY_ GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
+ GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_ static void
+ UniversalPrintCharArray(const CharType* begin, size_t len,
+ ostream* os) {
// The code
// const char kFoo[] = "foo";
// generates an array of 4, not 3, elements, with the last one being '\0'.
@@ -436,28 +458,28 @@ void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
namespace {
bool ContainsUnprintableControlCodes(const char* str, size_t length) {
- const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+ const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
for (size_t i = 0; i < length; i++) {
unsigned char ch = *s++;
if (std::iscntrl(ch)) {
- switch (ch) {
+ switch (ch) {
case '\t':
case '\n':
case '\r':
break;
default:
return true;
- }
}
+ }
}
return false;
}
-bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t<= 0xbf; }
+bool IsUTF8TrailByte(unsigned char t) { return 0x80 <= t && t <= 0xbf; }
bool IsValidUTF8(const char* str, size_t length) {
- const unsigned char *s = reinterpret_cast<const unsigned char *>(str);
+ const unsigned char* s = reinterpret_cast<const unsigned char*>(str);
for (size_t i = 0; i < length;) {
unsigned char lead = s[i++];
@@ -470,15 +492,13 @@ bool IsValidUTF8(const char* str, size_t length) {
} else if (lead <= 0xdf && (i + 1) <= length && IsUTF8TrailByte(s[i])) {
++i; // 2-byte character
} else if (0xe0 <= lead && lead <= 0xef && (i + 2) <= length &&
- IsUTF8TrailByte(s[i]) &&
- IsUTF8TrailByte(s[i + 1]) &&
+ IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
// check for non-shortest form and surrogate
(lead != 0xe0 || s[i] >= 0xa0) &&
(lead != 0xed || s[i] < 0xa0)) {
i += 2; // 3-byte character
} else if (0xf0 <= lead && lead <= 0xf4 && (i + 3) <= length &&
- IsUTF8TrailByte(s[i]) &&
- IsUTF8TrailByte(s[i + 1]) &&
+ IsUTF8TrailByte(s[i]) && IsUTF8TrailByte(s[i + 1]) &&
IsUTF8TrailByte(s[i + 2]) &&
// check for non-shortest form
(lead != 0xf0 || s[i] >= 0x90) &&
@@ -502,7 +522,7 @@ void ConditionalPrintAsText(const char* str, size_t length, ostream* os) {
void PrintStringTo(const ::std::string& s, ostream* os) {
if (PrintCharsAsStringTo(s.data(), s.size(), os) == kHexEscape) {
- if (GTEST_FLAG(print_utf8)) {
+ if (GTEST_FLAG_GET(print_utf8)) {
ConditionalPrintAsText(s.data(), s.size(), os);
}
}
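
IsValidUTF8() classifies each lead byte by range and then demands the right number of 0x80..0xBF trail bytes, with extra guards against overlong ("non-shortest form") encodings and UTF-16 surrogates. Summarized (a reading of the checks above, not additional code):

    // 00..7F                  1-byte (ASCII)
    // C2..DF + 1 trail byte   2-byte (C0/C1 would be overlong)
    // E0..EF + 2 trail bytes  3-byte (E0 A0.. excludes overlong,
    //                                 ED ..9F excludes surrogates)
    // F0..F4 + 3 trail bytes  4-byte (F0 90.. excludes overlong,
    //                                 F4 ..8F caps input at U+10FFFF)
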
diff --git a/libvpx/third_party/googletest/src/src/gtest-test-part.cc b/libvpx/third_party/googletest/src/src/gtest-test-part.cc
index a938683ce..eb7c8d1cf 100644
--- a/libvpx/third_party/googletest/src/src/gtest-test-part.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-test-part.cc
@@ -51,13 +51,11 @@ std::ostream& operator<<(std::ostream& os, const TestPartResult& result) {
return os << internal::FormatFileLocation(result.file_name(),
result.line_number())
<< " "
- << (result.type() == TestPartResult::kSuccess
- ? "Success"
- : result.type() == TestPartResult::kSkip
- ? "Skipped"
- : result.type() == TestPartResult::kFatalFailure
- ? "Fatal failure"
- : "Non-fatal failure")
+ << (result.type() == TestPartResult::kSuccess ? "Success"
+ : result.type() == TestPartResult::kSkip ? "Skipped"
+ : result.type() == TestPartResult::kFatalFailure
+ ? "Fatal failure"
+ : "Non-fatal failure")
<< ":\n"
<< result.message() << std::endl;
}
@@ -86,8 +84,8 @@ namespace internal {
HasNewFatalFailureHelper::HasNewFatalFailureHelper()
: has_new_fatal_failure_(false),
- original_reporter_(GetUnitTestImpl()->
- GetTestPartResultReporterForCurrentThread()) {
+ original_reporter_(
+ GetUnitTestImpl()->GetTestPartResultReporterForCurrentThread()) {
GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this);
}
@@ -98,8 +96,7 @@ HasNewFatalFailureHelper::~HasNewFatalFailureHelper() {
void HasNewFatalFailureHelper::ReportTestPartResult(
const TestPartResult& result) {
- if (result.fatally_failed())
- has_new_fatal_failure_ = true;
+ if (result.fatally_failed()) has_new_fatal_failure_ = true;
original_reporter_->ReportTestPartResult(result);
}
diff --git a/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
index c02c3df65..a2828b83c 100644
--- a/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
@@ -27,7 +27,6 @@
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
#include "gtest/gtest-typed-test.h"
#include "gtest/gtest.h"
@@ -38,8 +37,7 @@ namespace internal {
// Skips to the first non-space char in str. Returns an empty string if str
// contains only whitespace characters.
static const char* SkipSpaces(const char* str) {
- while (IsSpace(*str))
- str++;
+ while (IsSpace(*str)) str++;
return str;
}
@@ -85,8 +83,7 @@ const char* TypedTestSuitePState::VerifyRegisteredTestNames(
}
for (RegisteredTestIter it = registered_tests_.begin();
- it != registered_tests_.end();
- ++it) {
+ it != registered_tests_.end(); ++it) {
if (tests.count(it->first) == 0) {
errors << "You forgot to list test " << it->first << ".\n";
}
diff --git a/libvpx/third_party/googletest/src/src/gtest.cc b/libvpx/third_party/googletest/src/src/gtest.cc
index 21c611aff..6f31dd226 100644
--- a/libvpx/third_party/googletest/src/src/gtest.cc
+++ b/libvpx/third_party/googletest/src/src/gtest.cc
@@ -31,8 +31,6 @@
// The Google C++ Testing and Mocking Framework (Google Test)
#include "gtest/gtest.h"
-#include "gtest/internal/custom/gtest.h"
-#include "gtest/gtest-spi.h"
#include <ctype.h>
#include <stdarg.h>
@@ -46,79 +44,87 @@
#include <chrono> // NOLINT
#include <cmath>
#include <cstdint>
+#include <initializer_list>
#include <iomanip>
+#include <iterator>
#include <limits>
#include <list>
#include <map>
#include <ostream> // NOLINT
#include <sstream>
+#include <unordered_set>
#include <vector>
+#include "gtest/gtest-assertion-result.h"
+#include "gtest/gtest-spi.h"
+#include "gtest/internal/custom/gtest.h"
+
#if GTEST_OS_LINUX
-# include <fcntl.h> // NOLINT
-# include <limits.h> // NOLINT
-# include <sched.h> // NOLINT
+#include <fcntl.h> // NOLINT
+#include <limits.h> // NOLINT
+#include <sched.h> // NOLINT
// Declares vsnprintf(). This header is not available on Windows.
-# include <strings.h> // NOLINT
-# include <sys/mman.h> // NOLINT
-# include <sys/time.h> // NOLINT
-# include <unistd.h> // NOLINT
-# include <string>
+#include <strings.h> // NOLINT
+#include <sys/mman.h> // NOLINT
+#include <sys/time.h> // NOLINT
+#include <unistd.h> // NOLINT
+
+#include <string>
#elif GTEST_OS_ZOS
-# include <sys/time.h> // NOLINT
+#include <sys/time.h> // NOLINT
// On z/OS we additionally need strings.h for strcasecmp.
-# include <strings.h> // NOLINT
+#include <strings.h> // NOLINT
#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE.
-# include <windows.h> // NOLINT
-# undef min
+#include <windows.h> // NOLINT
+#undef min
#elif GTEST_OS_WINDOWS // We are on Windows proper.
-# include <windows.h> // NOLINT
-# undef min
+#include <windows.h> // NOLINT
+#undef min
#ifdef _MSC_VER
-# include <crtdbg.h> // NOLINT
+#include <crtdbg.h> // NOLINT
#endif
-# include <io.h> // NOLINT
-# include <sys/timeb.h> // NOLINT
-# include <sys/types.h> // NOLINT
-# include <sys/stat.h> // NOLINT
+#include <io.h> // NOLINT
+#include <sys/stat.h> // NOLINT
+#include <sys/timeb.h> // NOLINT
+#include <sys/types.h> // NOLINT
-# if GTEST_OS_WINDOWS_MINGW
-# include <sys/time.h> // NOLINT
-# endif // GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS_MINGW
+#include <sys/time.h> // NOLINT
+#endif // GTEST_OS_WINDOWS_MINGW
#else
// cpplint thinks that the header is already included, so we want to
// silence it.
-# include <sys/time.h> // NOLINT
-# include <unistd.h> // NOLINT
+#include <sys/time.h> // NOLINT
+#include <unistd.h> // NOLINT
#endif // GTEST_OS_LINUX
#if GTEST_HAS_EXCEPTIONS
-# include <stdexcept>
+#include <stdexcept>
#endif
#if GTEST_CAN_STREAM_RESULTS_
-# include <arpa/inet.h> // NOLINT
-# include <netdb.h> // NOLINT
-# include <sys/socket.h> // NOLINT
-# include <sys/types.h> // NOLINT
+#include <arpa/inet.h> // NOLINT
+#include <netdb.h> // NOLINT
+#include <sys/socket.h> // NOLINT
+#include <sys/types.h> // NOLINT
#endif
#include "src/gtest-internal-inl.h"
#if GTEST_OS_WINDOWS
-# define vsnprintf _vsnprintf
+#define vsnprintf _vsnprintf
#endif // GTEST_OS_WINDOWS
#if GTEST_OS_MAC
@@ -131,7 +137,10 @@
#include "absl/debugging/failure_signal_handler.h"
#include "absl/debugging/stacktrace.h"
#include "absl/debugging/symbolize.h"
+#include "absl/flags/parse.h"
+#include "absl/flags/usage.h"
#include "absl/strings/str_cat.h"
+#include "absl/strings/str_replace.h"
#endif // GTEST_HAS_ABSL
namespace testing {
@@ -177,7 +186,7 @@ const char kStackTraceMarker[] = "\nStack trace:\n";
// is specified on the command line.
bool g_help_flag = false;
-// Utilty function to Open File for Writing
+// Utility function to Open File for Writing
static FILE* OpenFileForWriting(const std::string& output_file) {
FILE* fileout = nullptr;
FilePath output_file_path(output_file);
@@ -216,28 +225,33 @@ static bool GetDefaultFailFast() {
return false;
}
+} // namespace testing
+
GTEST_DEFINE_bool_(
- fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()),
+ fail_fast,
+ testing::internal::BoolFromGTestEnv("fail_fast",
+ testing::GetDefaultFailFast()),
"True if and only if a test failure should stop further test execution.");
GTEST_DEFINE_bool_(
also_run_disabled_tests,
- internal::BoolFromGTestEnv("also_run_disabled_tests", false),
+ testing::internal::BoolFromGTestEnv("also_run_disabled_tests", false),
"Run disabled tests too, in addition to the tests normally being run.");
GTEST_DEFINE_bool_(
- break_on_failure, internal::BoolFromGTestEnv("break_on_failure", false),
+ break_on_failure,
+ testing::internal::BoolFromGTestEnv("break_on_failure", false),
"True if and only if a failed assertion should be a debugger "
"break-point.");
GTEST_DEFINE_bool_(catch_exceptions,
- internal::BoolFromGTestEnv("catch_exceptions", true),
+ testing::internal::BoolFromGTestEnv("catch_exceptions",
+ true),
"True if and only if " GTEST_NAME_
" should catch exceptions and treat them as test failures.");
GTEST_DEFINE_string_(
- color,
- internal::StringFromGTestEnv("color", "auto"),
+ color, testing::internal::StringFromGTestEnv("color", "auto"),
"Whether to use colors in the output. Valid values: yes, no, "
"and auto. 'auto' means to use colors if the output is "
"being sent to a terminal and the TERM environment variable "
@@ -245,7 +259,8 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_string_(
filter,
- internal::StringFromGTestEnv("filter", GetDefaultFilter()),
+ testing::internal::StringFromGTestEnv("filter",
+ testing::GetDefaultFilter()),
"A colon-separated list of glob (not regex) patterns "
"for filtering the tests to run, optionally followed by a "
"'-' and a : separated list of negative patterns (tests to "
@@ -254,13 +269,14 @@ GTEST_DEFINE_string_(
GTEST_DEFINE_bool_(
install_failure_signal_handler,
- internal::BoolFromGTestEnv("install_failure_signal_handler", false),
- "If true and supported on the current platform, " GTEST_NAME_ " should "
+ testing::internal::BoolFromGTestEnv("install_failure_signal_handler",
+ false),
+ "If true and supported on the current platform, " GTEST_NAME_
+ " should "
"install a signal handler that dumps debugging information when fatal "
"signals are raised.");
-GTEST_DEFINE_bool_(list_tests, false,
- "List all tests without running them.");
+GTEST_DEFINE_bool_(list_tests, false, "List all tests without running them.");
// The net priority order after flag processing is thus:
// --gtest_output command line flag
@@ -269,8 +285,8 @@ GTEST_DEFINE_bool_(list_tests, false,
// ''
GTEST_DEFINE_string_(
output,
- internal::StringFromGTestEnv("output",
- internal::OutputFlagAlsoCheckEnvVar().c_str()),
+ testing::internal::StringFromGTestEnv(
+ "output", testing::internal::OutputFlagAlsoCheckEnvVar().c_str()),
"A format (defaults to \"xml\" but can be specified to be \"json\"), "
"optionally followed by a colon and an output file name or directory. "
"A directory is indicated by a trailing pathname separator. "
@@ -281,65 +297,79 @@ GTEST_DEFINE_string_(
"digits.");
GTEST_DEFINE_bool_(
- brief, internal::BoolFromGTestEnv("brief", false),
+ brief, testing::internal::BoolFromGTestEnv("brief", false),
"True if only test failures should be displayed in text output.");
-GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true),
+GTEST_DEFINE_bool_(print_time,
+ testing::internal::BoolFromGTestEnv("print_time", true),
"True if and only if " GTEST_NAME_
" should display elapsed time in text output.");
-GTEST_DEFINE_bool_(print_utf8, internal::BoolFromGTestEnv("print_utf8", true),
+GTEST_DEFINE_bool_(print_utf8,
+ testing::internal::BoolFromGTestEnv("print_utf8", true),
"True if and only if " GTEST_NAME_
" prints UTF8 characters as text.");
GTEST_DEFINE_int32_(
- random_seed,
- internal::Int32FromGTestEnv("random_seed", 0),
+ random_seed, testing::internal::Int32FromGTestEnv("random_seed", 0),
"Random number seed to use when shuffling test orders. Must be in range "
"[1, 99999], or 0 to use a seed based on the current time.");
GTEST_DEFINE_int32_(
- repeat,
- internal::Int32FromGTestEnv("repeat", 1),
+ repeat, testing::internal::Int32FromGTestEnv("repeat", 1),
"How many times to repeat each test. Specify a negative number "
"for repeating forever. Useful for shaking out flaky tests.");
+GTEST_DEFINE_bool_(
+ recreate_environments_when_repeating,
+ testing::internal::BoolFromGTestEnv("recreate_environments_when_repeating",
+ false),
+ "Controls whether global test environments are recreated for each repeat "
+ "of the tests. If set to false the global test environments are only set "
+ "up once, for the first iteration, and only torn down once, for the last. "
+ "Useful for shaking out flaky tests with stable, expensive test "
+ "environments. If --gtest_repeat is set to a negative number, meaning "
+ "there is no last run, the environments will always be recreated to avoid "
+ "leaks.");
+
GTEST_DEFINE_bool_(show_internal_stack_frames, false,
"True if and only if " GTEST_NAME_
" should include internal stack frames when "
"printing test failure stack traces.");
-GTEST_DEFINE_bool_(shuffle, internal::BoolFromGTestEnv("shuffle", false),
+GTEST_DEFINE_bool_(shuffle,
+ testing::internal::BoolFromGTestEnv("shuffle", false),
"True if and only if " GTEST_NAME_
" should randomize tests' order on every run.");
GTEST_DEFINE_int32_(
stack_trace_depth,
- internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth),
+ testing::internal::Int32FromGTestEnv("stack_trace_depth",
+ testing::kMaxStackTraceDepth),
"The maximum number of stack frames to print when an "
"assertion fails. The valid range is 0 through 100, inclusive.");
GTEST_DEFINE_string_(
stream_result_to,
- internal::StringFromGTestEnv("stream_result_to", ""),
+ testing::internal::StringFromGTestEnv("stream_result_to", ""),
"This flag specifies the host name and the port number on which to stream "
"test results. Example: \"localhost:555\". The flag is effective only on "
"Linux.");
GTEST_DEFINE_bool_(
throw_on_failure,
- internal::BoolFromGTestEnv("throw_on_failure", false),
+ testing::internal::BoolFromGTestEnv("throw_on_failure", false),
"When this flag is specified, a failed assertion will throw an exception "
"if exceptions are enabled or exit the program with a non-zero code "
"otherwise. For use with an external test framework.");
#if GTEST_USE_OWN_FLAGFILE_FLAG_
GTEST_DEFINE_string_(
- flagfile,
- internal::StringFromGTestEnv("flagfile", ""),
+ flagfile, testing::internal::StringFromGTestEnv("flagfile", ""),
"This flag specifies the flagfile to read command-line flags from.");
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
+namespace testing {
namespace internal {
// Generates a random number from [0, range), using a Linear
@@ -348,10 +378,9 @@ namespace internal {
uint32_t Random::Generate(uint32_t range) {
// These constants are the same as are used in glibc's rand(3).
// Use wider types than necessary to prevent unsigned overflow diagnostics.
- state_ = static_cast<uint32_t>(1103515245ULL*state_ + 12345U) % kMaxRange;
+ state_ = static_cast<uint32_t>(1103515245ULL * state_ + 12345U) % kMaxRange;
- GTEST_CHECK_(range > 0)
- << "Cannot generate a number in the range [0, 0).";
+ GTEST_CHECK_(range > 0) << "Cannot generate a number in the range [0, 0).";
GTEST_CHECK_(range <= kMaxRange)
<< "Generation of a number in [0, " << range << ") was requested, "
<< "but this can only generate numbers in [0, " << kMaxRange << ").";
@@ -396,32 +425,26 @@ static bool ShouldRunTestSuite(const TestSuite* test_suite) {
}
// AssertHelper constructor.
-AssertHelper::AssertHelper(TestPartResult::Type type,
- const char* file,
- int line,
- const char* message)
- : data_(new AssertHelperData(type, file, line, message)) {
-}
+AssertHelper::AssertHelper(TestPartResult::Type type, const char* file,
+ int line, const char* message)
+ : data_(new AssertHelperData(type, file, line, message)) {}
-AssertHelper::~AssertHelper() {
- delete data_;
-}
+AssertHelper::~AssertHelper() { delete data_; }
// Message assignment, for assertion streaming support.
void AssertHelper::operator=(const Message& message) const {
- UnitTest::GetInstance()->
- AddTestPartResult(data_->type, data_->file, data_->line,
- AppendUserMessage(data_->message, message),
- UnitTest::GetInstance()->impl()
- ->CurrentOsStackTraceExceptTop(1)
- // Skips the stack frame for this function itself.
- ); // NOLINT
+ UnitTest::GetInstance()->AddTestPartResult(
+ data_->type, data_->file, data_->line,
+ AppendUserMessage(data_->message, message),
+ UnitTest::GetInstance()->impl()->CurrentOsStackTraceExceptTop(1)
+ // Skips the stack frame for this function itself.
+ ); // NOLINT
}
namespace {
// When TEST_P is found without a matching INSTANTIATE_TEST_SUITE_P
-// to creates test cases for it, a syntetic test case is
+// to create test cases for it, a synthetic test case is
// inserted to report either an error or a log message.
//
// This configuration bit will likely be removed at some point.
@@ -452,7 +475,6 @@ class FailureTest : public Test {
const bool as_error_;
};
-
} // namespace
std::set<std::string>* GetIgnoredParameterizedTestSuites() {
@@ -496,7 +518,8 @@ void InsertSyntheticTestCase(const std::string& name, CodeLocation location,
"To suppress this error for this test suite, insert the following line "
"(in a non-header) in the namespace it is defined in:"
"\n\n"
- "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");";
+ "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
+ name + ");";
std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
RegisterTest( //
@@ -516,19 +539,18 @@ void RegisterTypeParameterizedTestSuite(const char* test_suite_name,
}
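
The suppression line quoted in the synthetic-failure message above is a real public macro: placed at namespace scope in a non-header file, it tells Google Test that leaving a parameterized suite uninstantiated is intentional. Usage sketch (the suite name is hypothetical):

    class MyParamSuite : public testing::TestWithParam<int> {};
    TEST_P(MyParamSuite, IsNonNegative) { EXPECT_GE(GetParam(), 0); }

    // Deliberately no INSTANTIATE_TEST_SUITE_P; silence the synthetic error:
    GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(MyParamSuite);
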
void RegisterTypeParameterizedTestSuiteInstantiation(const char* case_name) {
- GetUnitTestImpl()
- ->type_parameterized_test_registry()
- .RegisterInstantiation(case_name);
+ GetUnitTestImpl()->type_parameterized_test_registry().RegisterInstantiation(
+ case_name);
}
void TypeParameterizedTestSuiteRegistry::RegisterTestSuite(
const char* test_suite_name, CodeLocation code_location) {
suites_.emplace(std::string(test_suite_name),
- TypeParameterizedTestSuiteInfo(code_location));
+ TypeParameterizedTestSuiteInfo(code_location));
}
void TypeParameterizedTestSuiteRegistry::RegisterInstantiation(
- const char* test_suite_name) {
+ const char* test_suite_name) {
auto it = suites_.find(std::string(test_suite_name));
if (it != suites_.end()) {
it->second.instantiated = true;
@@ -606,7 +628,8 @@ FilePath GetCurrentExecutableName() {
// Returns the output format, or "" for normal printed output.
std::string UnitTestOptions::GetOutputFormat() {
- const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+ std::string s = GTEST_FLAG_GET(output);
+ const char* const gtest_output_flag = s.c_str();
const char* const colon = strchr(gtest_output_flag, ':');
return (colon == nullptr)
? std::string(gtest_output_flag)
@@ -617,19 +640,19 @@ std::string UnitTestOptions::GetOutputFormat() {
// Returns the name of the requested output file, or the default if none
// was explicitly specified.
std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
- const char* const gtest_output_flag = GTEST_FLAG(output).c_str();
+ std::string s = GTEST_FLAG_GET(output);
+ const char* const gtest_output_flag = s.c_str();
std::string format = GetOutputFormat();
- if (format.empty())
- format = std::string(kDefaultOutputFormat);
+ if (format.empty()) format = std::string(kDefaultOutputFormat);
const char* const colon = strchr(gtest_output_flag, ':');
if (colon == nullptr)
return internal::FilePath::MakeFileName(
- internal::FilePath(
- UnitTest::GetInstance()->original_working_dir()),
- internal::FilePath(kDefaultOutputFile), 0,
- format.c_str()).string();
+ internal::FilePath(
+ UnitTest::GetInstance()->original_working_dir()),
+ internal::FilePath(kDefaultOutputFile), 0, format.c_str())
+ .string();
internal::FilePath output_name(colon + 1);
if (!output_name.IsAbsolutePath())
@@ -637,8 +660,7 @@ std::string UnitTestOptions::GetAbsolutePathToOutputFile() {
internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
internal::FilePath(colon + 1));
- if (!output_name.IsDirectory())
- return output_name.string();
+ if (!output_name.IsDirectory()) return output_name.string();
internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
output_name, internal::GetCurrentExecutableName(),
@@ -699,59 +721,119 @@ static bool PatternMatchesString(const std::string& name_str,
return true;
}
-bool UnitTestOptions::MatchesFilter(const std::string& name_str,
- const char* filter) {
- // The filter is a list of patterns separated by colons (:).
- const char* pattern = filter;
- while (true) {
- // Find the bounds of this pattern.
- const char* const next_sep = strchr(pattern, ':');
- const char* const pattern_end =
- next_sep != nullptr ? next_sep : pattern + strlen(pattern);
-
- // Check if this pattern matches name_str.
- if (PatternMatchesString(name_str, pattern, pattern_end)) {
- return true;
- }
+namespace {
+
+bool IsGlobPattern(const std::string& pattern) {
+ return std::any_of(pattern.begin(), pattern.end(),
+ [](const char c) { return c == '?' || c == '*'; });
+}
+
+class UnitTestFilter {
+ public:
+ UnitTestFilter() = default;
+
+ // Constructs a filter from a string of patterns separated by `:`.
+ explicit UnitTestFilter(const std::string& filter) {
+ // By design "" filter matches "" string.
+ std::vector<std::string> all_patterns;
+ SplitString(filter, ':', &all_patterns);
+ const auto exact_match_patterns_begin = std::partition(
+ all_patterns.begin(), all_patterns.end(), &IsGlobPattern);
+
+ glob_patterns_.reserve(static_cast<size_t>(
+ std::distance(all_patterns.begin(), exact_match_patterns_begin)));
+ std::move(all_patterns.begin(), exact_match_patterns_begin,
+ std::inserter(glob_patterns_, glob_patterns_.begin()));
+ std::move(
+ exact_match_patterns_begin, all_patterns.end(),
+ std::inserter(exact_match_patterns_, exact_match_patterns_.begin()));
+ }
+
+ // Returns true if and only if name matches at least one of the patterns in
+ // the filter.
+ bool MatchesName(const std::string& name) const {
+ return exact_match_patterns_.count(name) > 0 ||
+ std::any_of(glob_patterns_.begin(), glob_patterns_.end(),
+ [&name](const std::string& pattern) {
+ return PatternMatchesString(
+ name, pattern.c_str(),
+ pattern.c_str() + pattern.size());
+ });
+ }
+
+ private:
+ std::vector<std::string> glob_patterns_;
+ std::unordered_set<std::string> exact_match_patterns_;
+};
- // Give up on this pattern. However, if we found a pattern separator (:),
- // advance to the next pattern (skipping over the separator) and restart.
- if (next_sep == nullptr) {
- return false;
+class PositiveAndNegativeUnitTestFilter {
+ public:
+ // Constructs a positive and a negative filter from a string. The string
+ // contains a positive filter optionally followed by a '-' character and a
+ // negative filter. If only a negative filter is provided, the positive
+ // filter is assumed to be "*".
+ // A filter is a list of patterns separated by ':'.
+ explicit PositiveAndNegativeUnitTestFilter(const std::string& filter) {
+ std::vector<std::string> positive_and_negative_filters;
+
+ // NOTE: `SplitString` always returns a non-empty container.
+ SplitString(filter, '-', &positive_and_negative_filters);
+ const auto& positive_filter = positive_and_negative_filters.front();
+
+ if (positive_and_negative_filters.size() > 1) {
+ positive_filter_ = UnitTestFilter(
+ positive_filter.empty() ? kUniversalFilter : positive_filter);
+
+ // TODO(b/214626361): Fail on multiple '-' characters
+ // For the moment to preserve old behavior we concatenate the rest of the
+ // string parts with `-` as separator to generate the negative filter.
+ auto negative_filter_string = positive_and_negative_filters[1];
+ for (std::size_t i = 2; i < positive_and_negative_filters.size(); i++)
+ negative_filter_string =
+ negative_filter_string + '-' + positive_and_negative_filters[i];
+ negative_filter_ = UnitTestFilter(negative_filter_string);
+ } else {
+ // In case we don't have a negative filter and positive filter is ""
+ // we do not use kUniversalFilter by design as opposed to when we have a
+ // negative filter.
+ positive_filter_ = UnitTestFilter(positive_filter);
}
- pattern = next_sep + 1;
}
- return true;
+
+ // Returns true if and only if the test name (generated by appending the test
+ // suite name and the test name via a '.' character) matches the positive filter
+ // and does not match the negative filter.
+ bool MatchesTest(const std::string& test_suite_name,
+ const std::string& test_name) const {
+ return MatchesName(test_suite_name + "." + test_name);
+ }
+
+ // Returns true if and only if name matches the positive filter and does not
+ // match the negative filter.
+ bool MatchesName(const std::string& name) const {
+ return positive_filter_.MatchesName(name) &&
+ !negative_filter_.MatchesName(name);
+ }
+
+ private:
+ UnitTestFilter positive_filter_;
+ UnitTestFilter negative_filter_;
+};
+} // namespace
+
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+ const char* filter) {
+ return UnitTestFilter(filter).MatchesName(name_str);
}
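
UnitTestFilter speeds up matching by splitting the ':'-separated patterns once, up front: patterns containing '?' or '*' stay in a vector scanned with the glob matcher, everything else lands in an unordered_set for O(1) exact lookup. A condensed standalone sketch of that partition (SplitString replaced by a plain loop):

    #include <string>
    #include <unordered_set>
    #include <utility>
    #include <vector>

    struct FilterSketch {
      std::vector<std::string> globs;         // need the glob matcher
      std::unordered_set<std::string> exact;  // hash lookup is enough

      explicit FilterSketch(const std::string& filter) {
        std::vector<std::string> patterns;
        size_t start = 0, pos;
        while ((pos = filter.find(':', start)) != std::string::npos) {
          patterns.push_back(filter.substr(start, pos - start));
          start = pos + 1;
        }
        patterns.push_back(filter.substr(start));
        for (auto& p : patterns) {
          if (p.find_first_of("?*") != std::string::npos)
            globs.push_back(std::move(p));
          else
            exact.insert(std::move(p));
        }
      }
    };
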
// Returns true if and only if the user-specified filter matches the test
// suite name and the test name.
bool UnitTestOptions::FilterMatchesTest(const std::string& test_suite_name,
const std::string& test_name) {
- const std::string& full_name = test_suite_name + "." + test_name.c_str();
-
// Split --gtest_filter at '-', if there is one, to separate into
// positive filter and negative filter portions
- const char* const p = GTEST_FLAG(filter).c_str();
- const char* const dash = strchr(p, '-');
- std::string positive;
- std::string negative;
- if (dash == nullptr) {
- positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter
- negative = "";
- } else {
- positive = std::string(p, dash); // Everything up to the dash
- negative = std::string(dash + 1); // Everything after the dash
- if (positive.empty()) {
- // Treat '-test1' as the same as '*-test1'
- positive = kUniversalFilter;
- }
- }
-
- // A filter is a colon-separated list of patterns. It matches a
- // test if any pattern in it matches the test.
- return (MatchesFilter(full_name, positive.c_str()) &&
- !MatchesFilter(full_name, negative.c_str()));
+ return PositiveAndNegativeUnitTestFilter(GTEST_FLAG_GET(filter))
+ .MatchesTest(test_suite_name, test_name);
}
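
FilterMatchesTest() now delegates to the classes above while preserving the documented --gtest_filter grammar: a positive pattern list, optionally followed by '-' and a negative list, each list ':'-separated. Illustrative readings (test names hypothetical):

    // --gtest_filter=FooTest.*:BarTest.Basic-FooTest.Slow
    //   runs every FooTest except FooTest.Slow, plus BarTest.Basic
    // --gtest_filter=-*Death*
    //   an empty positive side is treated as "*", so this runs every test
    //   whose full "Suite.Test" name does not contain "Death"
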
#if GTEST_HAS_SEH
@@ -771,7 +853,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
bool should_handle = true;
- if (!GTEST_FLAG(catch_exceptions))
+ if (!GTEST_FLAG_GET(catch_exceptions))
should_handle = false;
else if (exception_code == EXCEPTION_BREAKPOINT)
should_handle = false;
@@ -789,8 +871,7 @@ int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) {
// results. Intercepts only failures from the current thread.
ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
TestPartResultArray* result)
- : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD),
- result_(result) {
+ : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), result_(result) {
Init();
}
@@ -799,8 +880,7 @@ ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
// results.
ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter(
InterceptMode intercept_mode, TestPartResultArray* result)
- : intercept_mode_(intercept_mode),
- result_(result) {
+ : intercept_mode_(intercept_mode), result_(result) {
Init();
}
@@ -844,9 +924,7 @@ namespace internal {
// from user test code. GetTestTypeId() is guaranteed to always
// return the same value, as it always calls GetTypeId<>() from the
// gtest.cc, which is within the Google Test framework.
-TypeId GetTestTypeId() {
- return GetTypeId<Test>();
-}
+TypeId GetTestTypeId() { return GetTypeId<Test>(); }
// The value of GetTestTypeId() as seen from within the Google Test
// library. This is solely for testing GetTestTypeId().
@@ -861,9 +939,9 @@ static AssertionResult HasOneFailure(const char* /* results_expr */,
const TestPartResultArray& results,
TestPartResult::Type type,
const std::string& substr) {
- const std::string expected(type == TestPartResult::kFatalFailure ?
- "1 fatal failure" :
- "1 non-fatal failure");
+ const std::string expected(type == TestPartResult::kFatalFailure
+ ? "1 fatal failure"
+ : "1 non-fatal failure");
Message msg;
if (results.size() != 1) {
msg << "Expected: " << expected << "\n"
@@ -882,10 +960,10 @@ static AssertionResult HasOneFailure(const char* /* results_expr */,
}
if (strstr(r.message(), substr.c_str()) == nullptr) {
- return AssertionFailure() << "Expected: " << expected << " containing \""
- << substr << "\"\n"
- << " Actual:\n"
- << r;
+ return AssertionFailure()
+ << "Expected: " << expected << " containing \"" << substr << "\"\n"
+ << " Actual:\n"
+ << r;
}
return AssertionSuccess();
@@ -908,7 +986,8 @@ SingleFailureChecker::~SingleFailureChecker() {
}
DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter(
- UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+ UnitTestImpl* unit_test)
+ : unit_test_(unit_test) {}
void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
const TestPartResult& result) {
@@ -917,7 +996,8 @@ void DefaultGlobalTestPartResultReporter::ReportTestPartResult(
}
DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter(
- UnitTestImpl* unit_test) : unit_test_(unit_test) {}
+ UnitTestImpl* unit_test)
+ : unit_test_(unit_test) {}
void DefaultPerThreadTestPartResultReporter::ReportTestPartResult(
const TestPartResult& result) {
@@ -1024,11 +1104,10 @@ int UnitTestImpl::test_to_run_count() const {
// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
std::string UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
return os_stack_trace_getter()->CurrentStackTrace(
- static_cast<int>(GTEST_FLAG(stack_trace_depth)),
- skip_count + 1
+ static_cast<int>(GTEST_FLAG_GET(stack_trace_depth)), skip_count + 1
// Skips the user-specified number of frames plus this function
// itself.
- ); // NOLINT
+ ); // NOLINT
}
// A helper class for measuring elapsed times.
@@ -1072,8 +1151,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) {
const int unicode_length =
MultiByteToWideChar(CP_ACP, 0, ansi, length, nullptr, 0);
WCHAR* unicode = new WCHAR[unicode_length + 1];
- MultiByteToWideChar(CP_ACP, 0, ansi, length,
- unicode, unicode_length);
+ MultiByteToWideChar(CP_ACP, 0, ansi, length, unicode, unicode_length);
unicode[unicode_length] = 0;
return unicode;
}
@@ -1082,7 +1160,7 @@ LPCWSTR String::AnsiToUtf16(const char* ansi) {
// memory using new. The caller is responsible for deleting the return
// value using delete[]. Returns the ANSI string, or NULL if the
// input is NULL.
-const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
if (!utf16_str) return nullptr;
const int ansi_length = WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, nullptr,
0, nullptr, nullptr);
@@ -1101,7 +1179,7 @@ const char* String::Utf16ToAnsi(LPCWSTR utf16_str) {
// Unlike strcmp(), this function can handle NULL argument(s). A NULL
// C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::CStringEquals(const char * lhs, const char * rhs) {
+bool String::CStringEquals(const char* lhs, const char* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
@@ -1115,11 +1193,10 @@ bool String::CStringEquals(const char * lhs, const char * rhs) {
// encoding, and streams the result to the given Message object.
static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length,
Message* msg) {
- for (size_t i = 0; i != length; ) { // NOLINT
+ for (size_t i = 0; i != length;) { // NOLINT
if (wstr[i] != L'\0') {
*msg << WideStringToUtf8(wstr + i, static_cast<int>(length - i));
- while (i != length && wstr[i] != L'\0')
- i++;
+ while (i != length && wstr[i] != L'\0') i++;
} else {
*msg << '\0';
i++;
@@ -1161,17 +1238,17 @@ Message::Message() : ss_(new ::std::stringstream) {
// These two overloads allow streaming a wide C string to a Message
// using the UTF-8 encoding.
-Message& Message::operator <<(const wchar_t* wide_c_str) {
+Message& Message::operator<<(const wchar_t* wide_c_str) {
return *this << internal::String::ShowWideCString(wide_c_str);
}
-Message& Message::operator <<(wchar_t* wide_c_str) {
+Message& Message::operator<<(wchar_t* wide_c_str) {
return *this << internal::String::ShowWideCString(wide_c_str);
}
#if GTEST_HAS_STD_WSTRING
// Converts the given wide string to a narrow string using the UTF-8
// encoding, and streams the result to this Message object.
-Message& Message::operator <<(const ::std::wstring& wstr) {
+Message& Message::operator<<(const ::std::wstring& wstr) {
internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this);
return *this;
}
@@ -1183,44 +1260,6 @@ std::string Message::GetString() const {
return internal::StringStreamToString(ss_.get());
}
-// AssertionResult constructors.
-// Used in EXPECT_TRUE/FALSE(assertion_result).
-AssertionResult::AssertionResult(const AssertionResult& other)
- : success_(other.success_),
- message_(other.message_.get() != nullptr
- ? new ::std::string(*other.message_)
- : static_cast< ::std::string*>(nullptr)) {}
-
-// Swaps two AssertionResults.
-void AssertionResult::swap(AssertionResult& other) {
- using std::swap;
- swap(success_, other.success_);
- swap(message_, other.message_);
-}
-
-// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE.
-AssertionResult AssertionResult::operator!() const {
- AssertionResult negation(!success_);
- if (message_.get() != nullptr) negation << *message_;
- return negation;
-}
-
-// Makes a successful assertion result.
-AssertionResult AssertionSuccess() {
- return AssertionResult(true);
-}
-
-// Makes a failed assertion result.
-AssertionResult AssertionFailure() {
- return AssertionResult(false);
-}
-
-// Makes a failed assertion result with the given failure message.
-// Deprecated; use AssertionFailure() << message.
-AssertionResult AssertionFailure(const Message& message) {
- return AssertionFailure() << message;
-}
-
namespace internal {
namespace edit_distance {
@@ -1512,8 +1551,7 @@ std::vector<std::string> SplitEscapedString(const std::string& str) {
AssertionResult EqFailure(const char* lhs_expression,
const char* rhs_expression,
const std::string& lhs_value,
- const std::string& rhs_value,
- bool ignoring_case) {
+ const std::string& rhs_value, bool ignoring_case) {
Message msg;
msg << "Expected equality of these values:";
msg << "\n " << lhs_expression;
@@ -1530,10 +1568,8 @@ AssertionResult EqFailure(const char* lhs_expression,
}
if (!lhs_value.empty() && !rhs_value.empty()) {
- const std::vector<std::string> lhs_lines =
- SplitEscapedString(lhs_value);
- const std::vector<std::string> rhs_lines =
- SplitEscapedString(rhs_value);
+ const std::vector<std::string> lhs_lines = SplitEscapedString(lhs_value);
+ const std::vector<std::string> rhs_lines = SplitEscapedString(rhs_value);
if (lhs_lines.size() > 1 || rhs_lines.size() > 1) {
msg << "\nWith diff:\n"
<< edit_distance::CreateUnifiedDiff(lhs_lines, rhs_lines);
@@ -1545,27 +1581,21 @@ AssertionResult EqFailure(const char* lhs_expression,
// Constructs a failure message for Boolean assertions such as EXPECT_TRUE.
std::string GetBoolAssertionFailureMessage(
- const AssertionResult& assertion_result,
- const char* expression_text,
- const char* actual_predicate_value,
- const char* expected_predicate_value) {
+ const AssertionResult& assertion_result, const char* expression_text,
+ const char* actual_predicate_value, const char* expected_predicate_value) {
const char* actual_message = assertion_result.message();
Message msg;
msg << "Value of: " << expression_text
<< "\n Actual: " << actual_predicate_value;
- if (actual_message[0] != '\0')
- msg << " (" << actual_message << ")";
+ if (actual_message[0] != '\0') msg << " (" << actual_message << ")";
msg << "\nExpected: " << expected_predicate_value;
return msg.GetString();
}
// Helper function for implementing ASSERT_NEAR.
-AssertionResult DoubleNearPredFormat(const char* expr1,
- const char* expr2,
- const char* abs_error_expr,
- double val1,
- double val2,
- double abs_error) {
+AssertionResult DoubleNearPredFormat(const char* expr1, const char* expr2,
+ const char* abs_error_expr, double val1,
+ double val2, double abs_error) {
const double diff = fabs(val1 - val2);
if (diff <= abs_error) return AssertionSuccess();
@@ -1595,20 +1625,17 @@ AssertionResult DoubleNearPredFormat(const char* expr1,
"EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
}
return AssertionFailure()
- << "The difference between " << expr1 << " and " << expr2
- << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
- << expr1 << " evaluates to " << val1 << ",\n"
- << expr2 << " evaluates to " << val2 << ", and\n"
- << abs_error_expr << " evaluates to " << abs_error << ".";
+ << "The difference between " << expr1 << " and " << expr2 << " is "
+ << diff << ", which exceeds " << abs_error_expr << ", where\n"
+ << expr1 << " evaluates to " << val1 << ",\n"
+ << expr2 << " evaluates to " << val2 << ", and\n"
+ << abs_error_expr << " evaluates to " << abs_error << ".";
}
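
DoubleNearPredFormat() backs ASSERT_NEAR and EXPECT_NEAR: the assertion passes exactly when |val1 - val2| <= abs_error. A worked example (numbers illustrative):

    TEST(MathTest, PiApprox) {
      // |22/7 - 3.14159265| is about 0.00126, below the 0.002 bound: passes.
      EXPECT_NEAR(22.0 / 7.0, 3.14159265, 0.002);
    }
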
-
// Helper template for implementing FloatLE() and DoubleLE().
template <typename RawType>
-AssertionResult FloatingPointLE(const char* expr1,
- const char* expr2,
- RawType val1,
- RawType val2) {
+AssertionResult FloatingPointLE(const char* expr1, const char* expr2,
+ RawType val1, RawType val2) {
// Returns success if val1 is less than val2,
if (val1 < val2) {
return AssertionSuccess();
@@ -1633,24 +1660,24 @@ AssertionResult FloatingPointLE(const char* expr1,
<< val2;
return AssertionFailure()
- << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
- << " Actual: " << StringStreamToString(&val1_ss) << " vs "
- << StringStreamToString(&val2_ss);
+ << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+ << " Actual: " << StringStreamToString(&val1_ss) << " vs "
+ << StringStreamToString(&val2_ss);
}
} // namespace internal
// Asserts that val1 is less than, or almost equal to, val2. Fails
// otherwise. In particular, it fails if either val1 or val2 is NaN.
-AssertionResult FloatLE(const char* expr1, const char* expr2,
- float val1, float val2) {
+AssertionResult FloatLE(const char* expr1, const char* expr2, float val1,
+ float val2) {
return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
}
// Asserts that val1 is less than, or almost equal to, val2. Fails
// otherwise. In particular, it fails if either val1 or val2 is NaN.
-AssertionResult DoubleLE(const char* expr1, const char* expr2,
- double val1, double val2) {
+AssertionResult DoubleLE(const char* expr1, const char* expr2, double val1,
+ double val2) {
return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
}
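
FloatLE and DoubleLE are predicate-formatters, so they are meant to be used through EXPECT_PRED_FORMAT2 rather than called directly; "almost equal to" is the 4-ULP tolerance of FloatingPoint<>::AlmostEquals(). A sketch:

    #include <cmath>
    #include "gtest/gtest.h"

    TEST(OrderTest, AlmostLessOrEqual) {
      const double a = 1.0;
      const double b = std::nextafter(a, 2.0);  // exactly 1 ULP above a
      // b > a, but within the 4-ULP tolerance, so DoubleLE still succeeds.
      EXPECT_PRED_FORMAT2(testing::DoubleLE, b, a);
    }
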
@@ -1658,62 +1685,51 @@ namespace internal {
// The helper function for {ASSERT|EXPECT}_STREQ.
AssertionResult CmpHelperSTREQ(const char* lhs_expression,
- const char* rhs_expression,
- const char* lhs,
+ const char* rhs_expression, const char* lhs,
const char* rhs) {
if (String::CStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- false);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), false);
}
// The helper function for {ASSERT|EXPECT}_STRCASEEQ.
AssertionResult CmpHelperSTRCASEEQ(const char* lhs_expression,
- const char* rhs_expression,
- const char* lhs,
+ const char* rhs_expression, const char* lhs,
const char* rhs) {
if (String::CaseInsensitiveCStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- true);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), true);
}
// The helper function for {ASSERT|EXPECT}_STRNE.
AssertionResult CmpHelperSTRNE(const char* s1_expression,
- const char* s2_expression,
- const char* s1,
+ const char* s2_expression, const char* s1,
const char* s2) {
if (!String::CStringEquals(s1, s2)) {
return AssertionSuccess();
} else {
- return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
- << s2_expression << "), actual: \""
- << s1 << "\" vs \"" << s2 << "\"";
+ return AssertionFailure()
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << "), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
}
}
// The helper function for {ASSERT|EXPECT}_STRCASENE.
AssertionResult CmpHelperSTRCASENE(const char* s1_expression,
- const char* s2_expression,
- const char* s1,
+ const char* s2_expression, const char* s1,
const char* s2) {
if (!String::CaseInsensitiveCStringEquals(s1, s2)) {
return AssertionSuccess();
} else {
return AssertionFailure()
- << "Expected: (" << s1_expression << ") != ("
- << s2_expression << ") (ignoring case), actual: \""
- << s1 << "\" vs \"" << s2 << "\"";
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << ") (ignoring case), actual: \"" << s1 << "\" vs \"" << s2 << "\"";
}
}
@@ -1741,8 +1757,7 @@ bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
// StringType here can be either ::std::string or ::std::wstring.
template <typename StringType>
-bool IsSubstringPred(const StringType& needle,
- const StringType& haystack) {
+bool IsSubstringPred(const StringType& needle, const StringType& haystack) {
return haystack.find(needle) != StringType::npos;
}
@@ -1751,21 +1766,22 @@ bool IsSubstringPred(const StringType& needle,
// StringType here can be const char*, const wchar_t*, ::std::string,
// or ::std::wstring.
template <typename StringType>
-AssertionResult IsSubstringImpl(
- bool expected_to_be_substring,
- const char* needle_expr, const char* haystack_expr,
- const StringType& needle, const StringType& haystack) {
+AssertionResult IsSubstringImpl(bool expected_to_be_substring,
+ const char* needle_expr,
+ const char* haystack_expr,
+ const StringType& needle,
+ const StringType& haystack) {
if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
return AssertionSuccess();
const bool is_wide_string = sizeof(needle[0]) > 1;
const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
return AssertionFailure()
- << "Value of: " << needle_expr << "\n"
- << " Actual: " << begin_string_quote << needle << "\"\n"
- << "Expected: " << (expected_to_be_substring ? "" : "not ")
- << "a substring of " << haystack_expr << "\n"
- << "Which is: " << begin_string_quote << haystack << "\"";
+ << "Value of: " << needle_expr << "\n"
+ << " Actual: " << begin_string_quote << needle << "\"\n"
+ << "Expected: " << (expected_to_be_substring ? "" : "not ")
+ << "a substring of " << haystack_expr << "\n"
+ << "Which is: " << begin_string_quote << haystack << "\"";
}
} // namespace
@@ -1774,52 +1790,52 @@ AssertionResult IsSubstringImpl(
// substring of haystack (NULL is considered a substring of itself
// only), and return an appropriate error message when they fail.
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const char* needle, const char* haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const wchar_t* needle, const wchar_t* haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const char* needle, const char* haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr, const char* needle,
+ const char* haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const wchar_t* needle, const wchar_t* haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr, const wchar_t* needle,
+ const wchar_t* haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::string& needle, const ::std::string& haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::string& needle,
+ const ::std::string& haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
#if GTEST_HAS_STD_WSTRING
-AssertionResult IsSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsSubstring(const char* needle_expr, const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack) {
return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack);
}
-AssertionResult IsNotSubstring(
- const char* needle_expr, const char* haystack_expr,
- const ::std::wstring& needle, const ::std::wstring& haystack) {
+AssertionResult IsNotSubstring(const char* needle_expr,
+ const char* haystack_expr,
+ const ::std::wstring& needle,
+ const ::std::wstring& haystack) {
return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack);
}
#endif // GTEST_HAS_STD_WSTRING
@@ -1831,43 +1847,42 @@ namespace internal {
namespace {
// Helper function for the IsHRESULT{Success,Failure} predicates
-AssertionResult HRESULTFailureHelper(const char* expr,
- const char* expected,
+AssertionResult HRESULTFailureHelper(const char* expr, const char* expected,
long hr) { // NOLINT
-# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_TV_TITLE
// Windows CE doesn't support FormatMessage.
const char error_text[] = "";
-# else
+#else
// Looks up the human-readable system message for the HRESULT code
// and since we're not passing any params to FormatMessage, we don't
// want inserts expanded.
- const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM |
- FORMAT_MESSAGE_IGNORE_INSERTS;
+ const DWORD kFlags =
+ FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS;
const DWORD kBufSize = 4096;
// Gets the system's human-readable message string for this HRESULT.
- char error_text[kBufSize] = { '\0' };
+ char error_text[kBufSize] = {'\0'};
DWORD message_length = ::FormatMessageA(kFlags,
- 0, // no source, we're asking system
+ 0, // no source, we're asking system
static_cast<DWORD>(hr), // the error
- 0, // no line width restrictions
+ 0, // no line width restrictions
error_text, // output buffer
kBufSize, // buf size
nullptr); // no arguments for inserts
// Trims trailing white space (FormatMessage leaves a trailing CR-LF)
for (; message_length && IsSpace(error_text[message_length - 1]);
- --message_length) {
+ --message_length) {
error_text[message_length - 1] = '\0';
}
-# endif // GTEST_OS_WINDOWS_MOBILE
+#endif // GTEST_OS_WINDOWS_MOBILE
const std::string error_hex("0x" + String::FormatHexInt(hr));
return ::testing::AssertionFailure()
- << "Expected: " << expr << " " << expected << ".\n"
- << " Actual: " << error_hex << " " << error_text << "\n";
+ << "Expected: " << expr << " " << expected << ".\n"
+ << " Actual: " << error_hex << " " << error_text << "\n";
}
} // namespace
@@ -1901,16 +1916,18 @@ AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT
// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
// The maximum code-point a one-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
+constexpr uint32_t kMaxCodePoint1 = (static_cast<uint32_t>(1) << 7) - 1;
// The maximum code-point a two-byte UTF-8 sequence can represent.
constexpr uint32_t kMaxCodePoint2 = (static_cast<uint32_t>(1) << (5 + 6)) - 1;
// The maximum code-point a three-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint3 = (static_cast<uint32_t>(1) << (4 + 2*6)) - 1;
+constexpr uint32_t kMaxCodePoint3 =
+ (static_cast<uint32_t>(1) << (4 + 2 * 6)) - 1;
// The maximum code-point a four-byte UTF-8 sequence can represent.
-constexpr uint32_t kMaxCodePoint4 = (static_cast<uint32_t>(1) << (3 + 3*6)) - 1;
+constexpr uint32_t kMaxCodePoint4 =
+ (static_cast<uint32_t>(1) << (3 + 3 * 6)) - 1;
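// Worked values (illustrative note, not from the upstream source): with the
// payload bits from the byte patterns above,
//   kMaxCodePoint1 = (1 << 7)  - 1 = 0x7F,
//   kMaxCodePoint2 = (1 << 11) - 1 = 0x7FF,
//   kMaxCodePoint3 = (1 << 16) - 1 = 0xFFFF,
//   kMaxCodePoint4 = (1 << 21) - 1 = 0x1FFFFF,
// so four-byte sequences comfortably cover the Unicode maximum U+10FFFF.
static_assert(kMaxCodePoint4 >= 0x10FFFF,
              "four-byte UTF-8 must cover the full Unicode range");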
// Chops off the n lowest bits from a bit pattern. Returns the n
// lowest bits. As a side effect, the original bit pattern will be
@@ -1935,7 +1952,7 @@ std::string CodePointToUtf8(uint32_t code_point) {
char str[5]; // Big enough for the largest valid code point.
if (code_point <= kMaxCodePoint1) {
str[1] = '\0';
- str[0] = static_cast<char>(code_point); // 0xxxxxxx
+ str[0] = static_cast<char>(code_point); // 0xxxxxxx
} else if (code_point <= kMaxCodePoint2) {
str[2] = '\0';
str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
@@ -1963,8 +1980,8 @@ std::string CodePointToUtf8(uint32_t code_point) {
// and thus should be combined into a single Unicode code point
// using CreateCodePointFromUtf16SurrogatePair.
inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
- return sizeof(wchar_t) == 2 &&
- (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+ return sizeof(wchar_t) == 2 && (first & 0xFC00) == 0xD800 &&
+ (second & 0xFC00) == 0xDC00;
}
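// Worked example (illustrative, standard UTF-16 decoding): U+1F600 is the
// pair 0xD83D 0xDE00; (0xD83D & 0xFC00) == 0xD800 and
// (0xDE00 & 0xFC00) == 0xDC00, so the pair is recognized, and
// 0x10000 + ((0xD83D - 0xD800) << 10) + (0xDE00 - 0xDC00) == 0x1F600
// recovers the original code point.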
// Creates a Unicode code point from UTF16 surrogate pair.
@@ -1995,8 +2012,7 @@ inline uint32_t CreateCodePointFromUtf16SurrogatePair(wchar_t first,
// and contains invalid UTF-16 surrogate pairs, values in those pairs
// will be encoded as individual Unicode characters from the Basic
// Multilingual Plane.
std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
- if (num_chars == -1)
- num_chars = static_cast<int>(wcslen(str));
+ if (num_chars == -1) num_chars = static_cast<int>(wcslen(str));
::std::stringstream stream;
for (int i = 0; i < num_chars; ++i) {
@@ -2005,8 +2021,8 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
if (str[i] == L'\0') {
break;
} else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
- unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
- str[i + 1]);
+ unicode_code_point =
+ CreateCodePointFromUtf16SurrogatePair(str[i], str[i + 1]);
i++;
} else {
unicode_code_point = static_cast<uint32_t>(str[i]);
@@ -2019,7 +2035,7 @@ std::string WideStringToUtf8(const wchar_t* str, int num_chars) {
// Converts a wide C string to an std::string using the UTF-8 encoding.
// NULL will be converted to "(null)".
-std::string String::ShowWideCString(const wchar_t * wide_c_str) {
+std::string String::ShowWideCString(const wchar_t* wide_c_str) {
if (wide_c_str == nullptr) return "(null)";
return internal::WideStringToUtf8(wide_c_str, -1);
@@ -2031,7 +2047,7 @@ std::string String::ShowWideCString(const wchar_t * wide_c_str) {
// Unlike wcscmp(), this function can handle NULL argument(s). A NULL
// C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
+bool String::WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
@@ -2041,33 +2057,27 @@ bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) {
// Helper function for *_STREQ on wide strings.
AssertionResult CmpHelperSTREQ(const char* lhs_expression,
- const char* rhs_expression,
- const wchar_t* lhs,
+ const char* rhs_expression, const wchar_t* lhs,
const wchar_t* rhs) {
if (String::WideCStringEquals(lhs, rhs)) {
return AssertionSuccess();
}
- return EqFailure(lhs_expression,
- rhs_expression,
- PrintToString(lhs),
- PrintToString(rhs),
- false);
+ return EqFailure(lhs_expression, rhs_expression, PrintToString(lhs),
+ PrintToString(rhs), false);
}
// Helper function for *_STRNE on wide strings.
AssertionResult CmpHelperSTRNE(const char* s1_expression,
- const char* s2_expression,
- const wchar_t* s1,
+ const char* s2_expression, const wchar_t* s1,
const wchar_t* s2) {
if (!String::WideCStringEquals(s1, s2)) {
return AssertionSuccess();
}
- return AssertionFailure() << "Expected: (" << s1_expression << ") != ("
- << s2_expression << "), actual: "
- << PrintToString(s1)
- << " vs " << PrintToString(s2);
+ return AssertionFailure()
+ << "Expected: (" << s1_expression << ") != (" << s2_expression
+ << "), actual: " << PrintToString(s1) << " vs " << PrintToString(s2);
}
// Compares two C strings, ignoring case. Returns true if and only if they have
@@ -2076,7 +2086,7 @@ AssertionResult CmpHelperSTRNE(const char* s1_expression,
// Unlike strcasecmp(), this function can handle NULL argument(s). A
// NULL C string is considered different to any non-NULL C string,
// including the empty string.
-bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) {
+bool String::CaseInsensitiveCStringEquals(const char* lhs, const char* rhs) {
if (lhs == nullptr) return rhs == nullptr;
if (rhs == nullptr) return false;
return posix::StrCaseCmp(lhs, rhs) == 0;
@@ -2118,8 +2128,8 @@ bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs,
// Returns true if and only if str ends with the given suffix, ignoring case.
// Any string is considered to end with an empty suffix.
-bool String::EndsWithCaseInsensitive(
- const std::string& str, const std::string& suffix) {
+bool String::EndsWithCaseInsensitive(const std::string& str,
+ const std::string& suffix) {
const size_t str_len = str.length();
const size_t suffix_len = suffix.length();
return (str_len >= suffix_len) &&
@@ -2202,15 +2212,13 @@ TestResult::TestResult()
: death_test_count_(0), start_timestamp_(0), elapsed_time_(0) {}
// D'tor.
-TestResult::~TestResult() {
-}
+TestResult::~TestResult() {}
// Returns the i-th test part result among all the results. i can
// range from 0 to total_part_count() - 1. If i is not in that range,
// aborts the program.
const TestPartResult& TestResult::GetTestPartResult(int i) const {
- if (i < 0 || i >= total_part_count())
- internal::posix::Abort();
+ if (i < 0 || i >= total_part_count()) internal::posix::Abort();
return test_part_results_.at(static_cast<size_t>(i));
}
@@ -2218,15 +2226,12 @@ const TestPartResult& TestResult::GetTestPartResult(int i) const {
// test_property_count() - 1. If i is not in that range, aborts the
// program.
const TestProperty& TestResult::GetTestProperty(int i) const {
- if (i < 0 || i >= test_property_count())
- internal::posix::Abort();
+ if (i < 0 || i >= test_property_count()) internal::posix::Abort();
return test_properties_.at(static_cast<size_t>(i));
}
// Clears the test part results.
-void TestResult::ClearTestPartResults() {
- test_part_results_.clear();
-}
+void TestResult::ClearTestPartResults() { test_part_results_.clear(); }
// Adds a test part result to the list.
void TestResult::AddTestPartResult(const TestPartResult& test_part_result) {
@@ -2255,15 +2260,8 @@ void TestResult::RecordProperty(const std::string& xml_element,
// The list of reserved attributes used in the <testsuites> element of XML
// output.
static const char* const kReservedTestSuitesAttributes[] = {
- "disabled",
- "errors",
- "failures",
- "name",
- "random_seed",
- "tests",
- "time",
- "timestamp"
-};
+ "disabled", "errors", "failures", "name",
+ "random_seed", "tests", "time", "timestamp"};
// The list of reserved attributes used in the <testsuite> element of XML
// output.
@@ -2273,8 +2271,8 @@ static const char* const kReservedTestSuiteAttributes[] = {
// The list of reserved attributes used in the <testcase> element of XML output.
static const char* const kReservedTestCaseAttributes[] = {
- "classname", "name", "status", "time", "type_param",
- "value_param", "file", "line"};
+ "classname", "name", "status", "time",
+ "type_param", "value_param", "file", "line"};
// Use a slightly different set for allowed output to ensure existing tests can
// still RecordProperty("result") or RecordProperty("timestamp")
@@ -2336,7 +2334,7 @@ static bool ValidateTestPropertyName(
const std::string& property_name,
const std::vector<std::string>& reserved_names) {
if (std::find(reserved_names.begin(), reserved_names.end(), property_name) !=
- reserved_names.end()) {
+ reserved_names.end()) {
ADD_FAILURE() << "Reserved key used in RecordProperty(): " << property_name
<< " (" << FormatWordList(reserved_names)
<< " are reserved by " << GTEST_NAME_ << ")";
@@ -2374,8 +2372,7 @@ bool TestResult::Skipped() const {
// Returns true if and only if the test failed.
bool TestResult::Failed() const {
for (int i = 0; i < total_part_count(); ++i) {
- if (GetTestPartResult(i).failed())
- return true;
+ if (GetTestPartResult(i).failed()) return true;
}
return false;
}
@@ -2416,27 +2413,22 @@ int TestResult::test_property_count() const {
// Creates a Test object.
// The c'tor saves the states of all flags.
-Test::Test()
- : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {
-}
+Test::Test() : gtest_flag_saver_(new GTEST_FLAG_SAVER_) {}
// The d'tor restores the states of all flags. The actual work is
// done by the d'tor of the gtest_flag_saver_ field, and thus not
// visible here.
-Test::~Test() {
-}
+Test::~Test() {}
// Sets up the test fixture.
//
// A sub-class may override this.
-void Test::SetUp() {
-}
+void Test::SetUp() {}
// Tears down the test fixture.
//
// A sub-class may override this.
-void Test::TearDown() {
-}
+void Test::TearDown() {}
// Allows user supplied key value pairs to be recorded for later output.
void Test::RecordProperty(const std::string& key, const std::string& value) {
@@ -2541,8 +2533,8 @@ bool Test::HasSameFixtureClass() {
static std::string* FormatSehExceptionMessage(DWORD exception_code,
const char* location) {
Message message;
- message << "SEH exception with code 0x" << std::setbase(16) <<
- exception_code << std::setbase(10) << " thrown in " << location << ".";
+ message << "SEH exception with code 0x" << std::setbase(16) << exception_code
+ << std::setbase(10) << " thrown in " << location << ".";
return new std::string(message.GetString());
}
@@ -2585,8 +2577,8 @@ GoogleTestFailureException::GoogleTestFailureException(
// exceptions in the same function. Therefore, we provide a separate
// wrapper function for handling SEH exceptions.)
template <class T, typename Result>
-Result HandleSehExceptionsInMethodIfSupported(
- T* object, Result (T::*method)(), const char* location) {
+Result HandleSehExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+ const char* location) {
#if GTEST_HAS_SEH
__try {
return (object->*method)();
@@ -2595,8 +2587,8 @@ Result HandleSehExceptionsInMethodIfSupported(
// We create the exception message on the heap because VC++ prohibits
// creation of objects with destructors on stack in functions using __try
// (see error C2712).
- std::string* exception_message = FormatSehExceptionMessage(
- GetExceptionCode(), location);
+ std::string* exception_message =
+ FormatSehExceptionMessage(GetExceptionCode(), location);
internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure,
*exception_message);
delete exception_message;
@@ -2612,8 +2604,8 @@ Result HandleSehExceptionsInMethodIfSupported(
// exceptions, if they are supported; returns the 0-value for type
// Result in case of an SEH exception.
template <class T, typename Result>
-Result HandleExceptionsInMethodIfSupported(
- T* object, Result (T::*method)(), const char* location) {
+Result HandleExceptionsInMethodIfSupported(T* object, Result (T::*method)(),
+ const char* location) {
// NOTE: The user code can affect the way in which Google Test handles
// exceptions by setting GTEST_FLAG(catch_exceptions), but only before
// RUN_ALL_TESTS() starts. It is technically possible to check the flag
@@ -2623,7 +2615,7 @@ Result HandleExceptionsInMethodIfSupported(
// try {
// // Perform the test method.
// } catch (...) {
- // if (GTEST_FLAG(catch_exceptions))
+ // if (GTEST_FLAG_GET(catch_exceptions))
// // Report the exception as failure.
// else
// throw; // Re-throws the original exception.
@@ -2679,16 +2671,16 @@ void Test::Run() {
// GTEST_SKIP().
if (!HasFatalFailure() && !IsSkipped()) {
impl->os_stack_trace_getter()->UponLeavingGTest();
- internal::HandleExceptionsInMethodIfSupported(
- this, &Test::TestBody, "the test body");
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::TestBody,
+ "the test body");
}
// However, we want to clean up as much as possible. Hence we will
// always call TearDown(), even if SetUp() or the test body has
// failed.
impl->os_stack_trace_getter()->UponLeavingGTest();
- internal::HandleExceptionsInMethodIfSupported(
- this, &Test::TearDown, "TearDown()");
+ internal::HandleExceptionsInMethodIfSupported(this, &Test::TearDown,
+ "TearDown()");
}
// Returns true if and only if the current test has a fatal failure.
@@ -2698,8 +2690,9 @@ bool Test::HasFatalFailure() {
// Returns true if and only if the current test has a non-fatal failure.
bool Test::HasNonfatalFailure() {
- return internal::GetUnitTestImpl()->current_test_result()->
- HasNonfatalFailure();
+ return internal::GetUnitTestImpl()
+ ->current_test_result()
+ ->HasNonfatalFailure();
}
// Returns true if and only if the current test was skipped.
@@ -2799,11 +2792,10 @@ class TestNameIs {
// Constructor.
//
// TestNameIs has NO default constructor.
- explicit TestNameIs(const char* name)
- : name_(name) {}
+ explicit TestNameIs(const char* name) : name_(name) {}
// Returns true if and only if the test name of test_info matches name_.
- bool operator()(const TestInfo * test_info) const {
+ bool operator()(const TestInfo* test_info) const {
return test_info && test_info->name() == name_;
}
@@ -2831,20 +2823,20 @@ void UnitTestImpl::RegisterParameterizedTests() {
// Creates the test object, runs it, records its result, and then
// deletes it.
void TestInfo::Run() {
- if (!should_run_) return;
+ TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+ if (!should_run_) {
+ if (is_disabled_ && matches_filter_) repeater->OnTestDisabled(*this);
+ return;
+ }
// Tells UnitTest where to store test result.
internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
impl->set_current_test_info(this);
- TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
-
// Notifies the unit test event listeners that a test is about to start.
repeater->OnTestStart(*this);
-
result_.set_start_timestamp(internal::GetTimeInMillis());
internal::Timer timer;
-
impl->os_stack_trace_getter()->UponLeavingGTest();
// Creates the test object.
@@ -3009,11 +3001,18 @@ void TestSuite::Run() {
internal::HandleExceptionsInMethodIfSupported(
this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
+ const bool skip_all = ad_hoc_test_result().Failed();
+
start_timestamp_ = internal::GetTimeInMillis();
internal::Timer timer;
for (int i = 0; i < total_test_count(); i++) {
- GetMutableTestInfo(i)->Run();
- if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) {
+ if (skip_all) {
+ GetMutableTestInfo(i)->Skip();
+ } else {
+ GetMutableTestInfo(i)->Run();
+ }
+ if (GTEST_FLAG_GET(fail_fast) &&
+ GetMutableTestInfo(i)->result()->Failed()) {
for (int j = i + 1; j < total_test_count(); j++) {
GetMutableTestInfo(j)->Skip();
}
@@ -3089,11 +3088,10 @@ void TestSuite::UnshuffleTests() {
//
// FormatCountableNoun(1, "formula", "formulae") returns "1 formula".
// FormatCountableNoun(5, "book", "books") returns "5 books".
-static std::string FormatCountableNoun(int count,
- const char * singular_form,
- const char * plural_form) {
+static std::string FormatCountableNoun(int count, const char* singular_form,
+ const char* plural_form) {
return internal::StreamableToString(count) + " " +
- (count == 1 ? singular_form : plural_form);
+ (count == 1 ? singular_form : plural_form);
}
// Formats the count of tests.
@@ -3110,7 +3108,7 @@ static std::string FormatTestSuiteCount(int test_suite_count) {
// representation. Both kNonFatalFailure and kFatalFailure are translated
// to "Failure", as the user usually doesn't care about the difference
// between the two when viewing the test result.
-static const char * TestPartResultTypeToString(TestPartResult::Type type) {
+static const char* TestPartResultTypeToString(TestPartResult::Type type) {
switch (type) {
case TestPartResult::kSkip:
return "Skipped\n";
@@ -3137,17 +3135,18 @@ enum class GTestColor { kDefault, kRed, kGreen, kYellow };
// Prints a TestPartResult to an std::string.
static std::string PrintTestPartResultToString(
const TestPartResult& test_part_result) {
- return (Message()
- << internal::FormatFileLocation(test_part_result.file_name(),
- test_part_result.line_number())
- << " " << TestPartResultTypeToString(test_part_result.type())
- << test_part_result.message()).GetString();
+ return (Message() << internal::FormatFileLocation(
+ test_part_result.file_name(),
+ test_part_result.line_number())
+ << " "
+ << TestPartResultTypeToString(test_part_result.type())
+ << test_part_result.message())
+ .GetString();
}
// Prints a TestPartResult.
static void PrintTestPartResult(const TestPartResult& test_part_result) {
- const std::string& result =
- PrintTestPartResultToString(test_part_result);
+ const std::string& result = PrintTestPartResultToString(test_part_result);
printf("%s\n", result.c_str());
fflush(stdout);
// If the test program runs in Visual Studio or a debugger, the
@@ -3164,8 +3163,8 @@ static void PrintTestPartResult(const TestPartResult& test_part_result) {
}
// class PrettyUnitTestResultPrinter
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
- !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+ !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
// Returns the character attribute for the given color.
static WORD GetColorAttribute(GTestColor color) {
@@ -3176,7 +3175,8 @@ static WORD GetColorAttribute(GTestColor color) {
return FOREGROUND_GREEN;
case GTestColor::kYellow:
return FOREGROUND_RED | FOREGROUND_GREEN;
- default: return 0;
+ default:
+ return 0;
}
}
@@ -3232,7 +3232,8 @@ static const char* GetAnsiColorCode(GTestColor color) {
// Returns true if and only if Google Test should use colors in the output.
bool ShouldUseColor(bool stdout_is_tty) {
- const char* const gtest_color = GTEST_FLAG(color).c_str();
+ std::string c = GTEST_FLAG_GET(color);
+ const char* const gtest_color = c.c_str();
if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) {
#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
@@ -3259,9 +3260,9 @@ bool ShouldUseColor(bool stdout_is_tty) {
}
return String::CaseInsensitiveCStringEquals(gtest_color, "yes") ||
- String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
- String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
- String::CStringEquals(gtest_color, "1");
+ String::CaseInsensitiveCStringEquals(gtest_color, "true") ||
+ String::CaseInsensitiveCStringEquals(gtest_color, "t") ||
+ String::CStringEquals(gtest_color, "1");
// We take "yes", "true", "t", and "1" as meaning "yes". If the
// value is neither one of these nor "auto", we treat it as "no" to
// be conservative.
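// Quick reference (illustrative): with --gtest_color=auto, output is colored
// only when stdout is a terminal; "yes", "true", "t", or "1" force color on;
// any other value disables it.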
@@ -3273,18 +3274,13 @@ bool ShouldUseColor(bool stdout_is_tty) {
// that would be colored when printed, as can be done on Linux.
GTEST_ATTRIBUTE_PRINTF_(2, 3)
-static void ColoredPrintf(GTestColor color, const char *fmt, ...) {
+static void ColoredPrintf(GTestColor color, const char* fmt, ...) {
va_list args;
va_start(args, fmt);
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS || GTEST_OS_IOS || \
- GTEST_OS_WINDOWS_PHONE || GTEST_OS_WINDOWS_RT || defined(ESP_PLATFORM)
- const bool use_color = AlwaysFalse();
-#else
static const bool in_color_mode =
ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
const bool use_color = in_color_mode && (color != GTestColor::kDefault);
-#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
if (!use_color) {
vprintf(fmt, args);
@@ -3292,8 +3288,8 @@ static void ColoredPrintf(GTestColor color, const char *fmt, ...) {
return;
}
-#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && \
- !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+ !GTEST_OS_WINDOWS_RT && !GTEST_OS_WINDOWS_MINGW
const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE);
// Gets the current text color.
@@ -3364,6 +3360,7 @@ class PrettyUnitTestResultPrinter : public TestEventListener {
#endif // OnTestCaseStart
void OnTestStart(const TestInfo& test_info) override;
+ void OnTestDisabled(const TestInfo& test_info) override;
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
@@ -3384,13 +3381,14 @@ class PrettyUnitTestResultPrinter : public TestEventListener {
static void PrintSkippedTests(const UnitTest& unit_test);
};
- // Fired before each iteration of tests starts.
+// Fired before each iteration of tests starts.
void PrettyUnitTestResultPrinter::OnTestIterationStart(
const UnitTest& unit_test, int iteration) {
- if (GTEST_FLAG(repeat) != 1)
+ if (GTEST_FLAG_GET(repeat) != 1)
printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1);
- const char* const filter = GTEST_FLAG(filter).c_str();
+ std::string f = GTEST_FLAG_GET(filter);
+ const char* const filter = f.c_str();
// Prints the filter if it's not *. This reminds the user that some
// tests may be skipped.
@@ -3406,7 +3404,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationStart(
internal::posix::GetEnv(kTestTotalShards));
}
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
ColoredPrintf(GTestColor::kYellow,
"Note: Randomizing tests' orders with a seed of %d .\n",
unit_test.random_seed());
@@ -3462,6 +3460,13 @@ void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
fflush(stdout);
}
+void PrettyUnitTestResultPrinter::OnTestDisabled(const TestInfo& test_info) {
+ ColoredPrintf(GTestColor::kYellow, "[ DISABLED ] ");
+ PrintTestName(test_info.test_suite_name(), test_info.name());
+ printf("\n");
+ fflush(stdout);
+}
+
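// Sample output from the handler above (illustrative; suite and test names
// are hypothetical):
//   [ DISABLED ] FooTest.DISABLED_Bar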
// Called after an assertion failure.
void PrettyUnitTestResultPrinter::OnTestPartResult(
const TestPartResult& result) {
@@ -3486,12 +3491,12 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
ColoredPrintf(GTestColor::kRed, "[ FAILED ] ");
}
PrintTestName(test_info.test_suite_name(), test_info.name());
- if (test_info.result()->Failed())
- PrintFullTestCommentIfPresent(test_info);
+ if (test_info.result()->Failed()) PrintFullTestCommentIfPresent(test_info);
- if (GTEST_FLAG(print_time)) {
- printf(" (%s ms)\n", internal::StreamableToString(
- test_info.result()->elapsed_time()).c_str());
+ if (GTEST_FLAG_GET(print_time)) {
+ printf(" (%s ms)\n",
+ internal::StreamableToString(test_info.result()->elapsed_time())
+ .c_str());
} else {
printf("\n");
}
@@ -3500,7 +3505,7 @@ void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
- if (!GTEST_FLAG(print_time)) return;
+ if (!GTEST_FLAG_GET(print_time)) return;
const std::string counts =
FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
@@ -3511,7 +3516,7 @@ void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) {
}
#else
void PrettyUnitTestResultPrinter::OnTestSuiteEnd(const TestSuite& test_suite) {
- if (!GTEST_FLAG(print_time)) return;
+ if (!GTEST_FLAG_GET(print_time)) return;
const std::string counts =
FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
@@ -3607,7 +3612,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
printf("%s from %s ran.",
FormatTestCount(unit_test.test_to_run_count()).c_str(),
FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms total)",
internal::StreamableToString(unit_test.elapsed_time()).c_str());
}
@@ -3628,7 +3633,7 @@ void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
}
int num_disabled = unit_test.reportable_disabled_test_count();
- if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+ if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
if (unit_test.Passed()) {
printf("\n"); // Add a spacer if no FAILURE banner is displayed.
}
@@ -3664,6 +3669,7 @@ class BriefUnitTestResultPrinter : public TestEventListener {
#endif // OnTestCaseStart
void OnTestStart(const TestInfo& /*test_info*/) override {}
+ void OnTestDisabled(const TestInfo& /*test_info*/) override {}
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
@@ -3700,7 +3706,7 @@ void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
PrintTestName(test_info.test_suite_name(), test_info.name());
PrintFullTestCommentIfPresent(test_info);
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms)\n",
internal::StreamableToString(test_info.result()->elapsed_time())
.c_str());
@@ -3717,7 +3723,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
printf("%s from %s ran.",
FormatTestCount(unit_test.test_to_run_count()).c_str(),
FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
- if (GTEST_FLAG(print_time)) {
+ if (GTEST_FLAG_GET(print_time)) {
printf(" (%s ms total)",
internal::StreamableToString(unit_test.elapsed_time()).c_str());
}
@@ -3732,7 +3738,7 @@ void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
}
int num_disabled = unit_test.reportable_disabled_test_count();
- if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+ if (num_disabled && !GTEST_FLAG_GET(also_run_disabled_tests)) {
if (unit_test.Passed()) {
printf("\n"); // Add a spacer if no FAILURE banner is displayed.
}
@@ -3752,7 +3758,7 @@ class TestEventRepeater : public TestEventListener {
public:
TestEventRepeater() : forwarding_enabled_(true) {}
~TestEventRepeater() override;
- void Append(TestEventListener *listener);
+ void Append(TestEventListener* listener);
TestEventListener* Release(TestEventListener* listener);
// Controls whether events will be forwarded to listeners_. Set to false
@@ -3770,6 +3776,7 @@ class TestEventRepeater : public TestEventListener {
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
void OnTestSuiteStart(const TestSuite& parameter) override;
void OnTestStart(const TestInfo& test_info) override;
+ void OnTestDisabled(const TestInfo& test_info) override;
void OnTestPartResult(const TestPartResult& result) override;
void OnTestEnd(const TestInfo& test_info) override;
// Legacy API is deprecated but still available
@@ -3789,18 +3796,19 @@ class TestEventRepeater : public TestEventListener {
// The list of listeners that receive events.
std::vector<TestEventListener*> listeners_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater);
+ TestEventRepeater(const TestEventRepeater&) = delete;
+ TestEventRepeater& operator=(const TestEventRepeater&) = delete;
};
TestEventRepeater::~TestEventRepeater() {
ForEach(listeners_, Delete<TestEventListener>);
}
-void TestEventRepeater::Append(TestEventListener *listener) {
+void TestEventRepeater::Append(TestEventListener* listener) {
listeners_.push_back(listener);
}
-TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
+TestEventListener* TestEventRepeater::Release(TestEventListener* listener) {
for (size_t i = 0; i < listeners_.size(); ++i) {
if (listeners_[i] == listener) {
listeners_.erase(listeners_.begin() + static_cast<int>(i));
@@ -3813,14 +3821,14 @@ TestEventListener* TestEventRepeater::Release(TestEventListener *listener) {
// Since most methods are very similar, use macros to reduce boilerplate.
// This defines a member that forwards the call to all listeners.
-#define GTEST_REPEATER_METHOD_(Name, Type) \
-void TestEventRepeater::Name(const Type& parameter) { \
- if (forwarding_enabled_) { \
- for (size_t i = 0; i < listeners_.size(); i++) { \
- listeners_[i]->Name(parameter); \
- } \
- } \
-}
+#define GTEST_REPEATER_METHOD_(Name, Type) \
+ void TestEventRepeater::Name(const Type& parameter) { \
+ if (forwarding_enabled_) { \
+ for (size_t i = 0; i < listeners_.size(); i++) { \
+ listeners_[i]->Name(parameter); \
+ } \
+ } \
+ }
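// For instance (illustrative expansion), GTEST_REPEATER_METHOD_(OnTestStart,
// TestInfo) defines:
//   void TestEventRepeater::OnTestStart(const TestInfo& parameter) {
//     if (forwarding_enabled_) {
//       for (size_t i = 0; i < listeners_.size(); i++) {
//         listeners_[i]->OnTestStart(parameter);
//       }
//     }
//   }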
// This defines a member that forwards the call to all listeners in reverse
// order.
#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \
@@ -3840,6 +3848,7 @@ GTEST_REPEATER_METHOD_(OnTestCaseStart, TestSuite)
#endif // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
GTEST_REPEATER_METHOD_(OnTestSuiteStart, TestSuite)
GTEST_REPEATER_METHOD_(OnTestStart, TestInfo)
+GTEST_REPEATER_METHOD_(OnTestDisabled, TestInfo)
GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult)
GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest)
GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest)
@@ -3890,12 +3899,13 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
private:
// Is c a whitespace character that is normalized to a space character
// when it appears in an XML attribute value?
- static bool IsNormalizableWhitespace(char c) {
- return c == 0x9 || c == 0xA || c == 0xD;
+ static bool IsNormalizableWhitespace(unsigned char c) {
+ return c == '\t' || c == '\n' || c == '\r';
}
// May c appear in a well-formed XML document?
- static bool IsValidXmlCharacter(char c) {
+ // https://www.w3.org/TR/REC-xml/#charsets
+ static bool IsValidXmlCharacter(unsigned char c) {
return IsNormalizableWhitespace(c) || c >= 0x20;
}
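// Example (illustrative): '\t' (0x09) passes both checks but is normalized
// when it appears in an attribute value, 0x1B (ESC) is rejected outright,
// and taking the argument as unsigned char keeps bytes >= 0x80 from
// comparing as negative and being dropped.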
@@ -3965,7 +3975,8 @@ class XmlUnitTestResultPrinter : public EmptyTestEventListener {
// The output file.
const std::string output_file_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter);
+ XmlUnitTestResultPrinter(const XmlUnitTestResultPrinter&) = delete;
+ XmlUnitTestResultPrinter& operator=(const XmlUnitTestResultPrinter&) = delete;
};
// Creates a new XmlUnitTestResultPrinter.
@@ -4005,8 +4016,8 @@ void XmlUnitTestResultPrinter::ListTestsMatchingFilter(
// module will consist of ordinary English text.
// If this module is ever modified to produce version 1.1 XML output,
// most invalid characters can be retained using character references.
-std::string XmlUnitTestResultPrinter::EscapeXml(
- const std::string& str, bool is_attribute) {
+std::string XmlUnitTestResultPrinter::EscapeXml(const std::string& str,
+ bool is_attribute) {
Message m;
for (size_t i = 0; i < str.size(); ++i) {
@@ -4034,8 +4045,9 @@ std::string XmlUnitTestResultPrinter::EscapeXml(
m << '"';
break;
default:
- if (IsValidXmlCharacter(ch)) {
- if (is_attribute && IsNormalizableWhitespace(ch))
+ if (IsValidXmlCharacter(static_cast<unsigned char>(ch))) {
+ if (is_attribute &&
+ IsNormalizableWhitespace(static_cast<unsigned char>(ch)))
m << "&#x" << String::FormatByte(static_cast<unsigned char>(ch))
<< ";";
else
@@ -4056,7 +4068,7 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
std::string output;
output.reserve(str.size());
for (std::string::const_iterator it = str.begin(); it != str.end(); ++it)
- if (IsValidXmlCharacter(*it))
+ if (IsValidXmlCharacter(static_cast<unsigned char>(*it)))
output.push_back(*it);
return output;
@@ -4064,7 +4076,6 @@ std::string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(
// The following routines generate an XML representation of a UnitTest
// object.
-// GOOGLETEST_CM0009 DO NOT DELETE
//
// This is how Google Test concepts map to the DTD:
//
@@ -4113,12 +4124,12 @@ std::string FormatEpochTimeInMillisAsIso8601(TimeInMillis ms) {
return "";
// YYYY-MM-DDThh:mm:ss.sss
return StreamableToString(time_struct.tm_year + 1900) + "-" +
- String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
- String::FormatIntWidth2(time_struct.tm_mday) + "T" +
- String::FormatIntWidth2(time_struct.tm_hour) + ":" +
- String::FormatIntWidth2(time_struct.tm_min) + ":" +
- String::FormatIntWidth2(time_struct.tm_sec) + "." +
- String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
+ String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+ String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+ String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+ String::FormatIntWidth2(time_struct.tm_min) + ":" +
+ String::FormatIntWidth2(time_struct.tm_sec) + "." +
+ String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
}
// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
@@ -4129,8 +4140,8 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
for (;;) {
const char* const next_segment = strstr(segment, "]]>");
if (next_segment != nullptr) {
- stream->write(
- segment, static_cast<std::streamsize>(next_segment - segment));
+ stream->write(segment,
+ static_cast<std::streamsize>(next_segment - segment));
*stream << "]]>]]&gt;<![CDATA[";
segment = next_segment + strlen("]]>");
} else {
@@ -4142,15 +4153,13 @@ void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
}
void XmlUnitTestResultPrinter::OutputXmlAttribute(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- const std::string& value) {
+ std::ostream* stream, const std::string& element_name,
+ const std::string& name, const std::string& value) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Attribute " << name << " is not allowed for element <" << element_name
<< ">.";
@@ -4216,10 +4225,11 @@ void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
OutputXmlAttribute(stream, kTestsuite, "type_param",
test_info.type_param());
}
- if (GTEST_FLAG(list_tests)) {
- OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
- OutputXmlAttribute(stream, kTestsuite, "line",
- StreamableToString(test_info.line()));
+
+ OutputXmlAttribute(stream, kTestsuite, "file", test_info.file());
+ OutputXmlAttribute(stream, kTestsuite, "line",
+ StreamableToString(test_info.line()));
+ if (GTEST_FLAG_GET(list_tests)) {
*stream << " />\n";
return;
}
@@ -4254,8 +4264,7 @@ void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
internal::FormatCompilerIndependentFileLocation(part.file_name(),
part.line_number());
const std::string summary = location + "\n" + part.summary();
- *stream << " <failure message=\""
- << EscapeXmlAttribute(summary)
+ *stream << " <failure message=\"" << EscapeXmlAttribute(summary)
<< "\" type=\"\">";
const std::string detail = location + "\n" + part.message();
OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
@@ -4295,7 +4304,7 @@ void XmlUnitTestResultPrinter::PrintXmlTestSuite(std::ostream* stream,
OutputXmlAttribute(stream, kTestsuite, "name", test_suite.name());
OutputXmlAttribute(stream, kTestsuite, "tests",
StreamableToString(test_suite.reportable_test_count()));
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputXmlAttribute(stream, kTestsuite, "failures",
StreamableToString(test_suite.failed_test_count()));
OutputXmlAttribute(
@@ -4343,7 +4352,7 @@ void XmlUnitTestResultPrinter::PrintXmlUnitTest(std::ostream* stream,
stream, kTestsuites, "timestamp",
FormatEpochTimeInMillisAsIso8601(unit_test.start_timestamp()));
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
OutputXmlAttribute(stream, kTestsuites, "random_seed",
StreamableToString(unit_test.random_seed()));
}
@@ -4396,7 +4405,7 @@ std::string XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes(
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
attributes << " " << property.key() << "="
- << "\"" << EscapeXmlAttribute(property.value()) << "\"";
+ << "\"" << EscapeXmlAttribute(property.value()) << "\"";
}
return attributes.GetString();
}
@@ -4410,15 +4419,15 @@ void XmlUnitTestResultPrinter::OutputXmlTestProperties(
return;
}
- *stream << "<" << kProperties << ">\n";
+ *stream << " <" << kProperties << ">\n";
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
- *stream << "<" << kProperty;
+ *stream << " <" << kProperty;
*stream << " name=\"" << EscapeXmlAttribute(property.key()) << "\"";
*stream << " value=\"" << EscapeXmlAttribute(property.value()) << "\"";
*stream << "/>\n";
}
- *stream << "</" << kProperties << ">\n";
+ *stream << " </" << kProperties << ">\n";
}
// End XmlUnitTestResultPrinter
@@ -4442,16 +4451,12 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
// streams the attribute as JSON.
static void OutputJsonKey(std::ostream* stream,
const std::string& element_name,
- const std::string& name,
- const std::string& value,
- const std::string& indent,
- bool comma = true);
+ const std::string& name, const std::string& value,
+ const std::string& indent, bool comma = true);
static void OutputJsonKey(std::ostream* stream,
const std::string& element_name,
- const std::string& name,
- int value,
- const std::string& indent,
- bool comma = true);
+ const std::string& name, int value,
+ const std::string& indent, bool comma = true);
// Streams a test suite JSON stanza containing the given test result.
//
@@ -4484,7 +4489,9 @@ class JsonUnitTestResultPrinter : public EmptyTestEventListener {
// The output file.
const std::string output_file_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(JsonUnitTestResultPrinter);
+ JsonUnitTestResultPrinter(const JsonUnitTestResultPrinter&) = delete;
+ JsonUnitTestResultPrinter& operator=(const JsonUnitTestResultPrinter&) =
+ delete;
};
// Creates a new JsonUnitTestResultPrinter.
@@ -4496,7 +4503,7 @@ JsonUnitTestResultPrinter::JsonUnitTestResultPrinter(const char* output_file)
}
void JsonUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
- int /*iteration*/) {
+ int /*iteration*/) {
FILE* jsonout = OpenFileForWriting(output_file_);
std::stringstream stream;
PrintJsonUnitTest(&stream, unit_test);
@@ -4562,55 +4569,48 @@ static std::string FormatEpochTimeInMillisAsRFC3339(TimeInMillis ms) {
return "";
// YYYY-MM-DDThh:mm:ss
return StreamableToString(time_struct.tm_year + 1900) + "-" +
- String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
- String::FormatIntWidth2(time_struct.tm_mday) + "T" +
- String::FormatIntWidth2(time_struct.tm_hour) + ":" +
- String::FormatIntWidth2(time_struct.tm_min) + ":" +
- String::FormatIntWidth2(time_struct.tm_sec) + "Z";
+ String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
+ String::FormatIntWidth2(time_struct.tm_mday) + "T" +
+ String::FormatIntWidth2(time_struct.tm_hour) + ":" +
+ String::FormatIntWidth2(time_struct.tm_min) + ":" +
+ String::FormatIntWidth2(time_struct.tm_sec) + "Z";
}
static inline std::string Indent(size_t width) {
return std::string(width, ' ');
}
-void JsonUnitTestResultPrinter::OutputJsonKey(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- const std::string& value,
- const std::string& indent,
- bool comma) {
+void JsonUnitTestResultPrinter::OutputJsonKey(std::ostream* stream,
+ const std::string& element_name,
+ const std::string& name,
+ const std::string& value,
+ const std::string& indent,
+ bool comma) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Key \"" << name << "\" is not allowed for value \"" << element_name
<< "\".";
*stream << indent << "\"" << name << "\": \"" << EscapeJson(value) << "\"";
- if (comma)
- *stream << ",\n";
+ if (comma) *stream << ",\n";
}
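// Usage sketch (illustrative; "FooTest" is a hypothetical value):
// OutputJsonKey(&stream, "testsuite", "name", "FooTest", Indent(6)) appends
//       "name": "FooTest",
// where the trailing comma comes from the default `comma = true`.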
void JsonUnitTestResultPrinter::OutputJsonKey(
- std::ostream* stream,
- const std::string& element_name,
- const std::string& name,
- int value,
- const std::string& indent,
- bool comma) {
+ std::ostream* stream, const std::string& element_name,
+ const std::string& name, int value, const std::string& indent, bool comma) {
const std::vector<std::string>& allowed_names =
GetReservedOutputAttributesForElement(element_name);
GTEST_CHECK_(std::find(allowed_names.begin(), allowed_names.end(), name) !=
- allowed_names.end())
+ allowed_names.end())
<< "Key \"" << name << "\" is not allowed for value \"" << element_name
<< "\".";
*stream << indent << "\"" << name << "\": " << StreamableToString(value);
- if (comma)
- *stream << ",\n";
+ if (comma) *stream << ",\n";
}
// Streams a test suite JSON stanza containing the given test result.
@@ -4620,7 +4620,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult(
*stream << Indent(4) << "{\n";
OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6));
OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6));
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6));
OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6));
OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6));
@@ -4674,11 +4674,14 @@ void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
OutputJsonKey(stream, kTestsuite, "type_param", test_info.type_param(),
kIndent);
}
- if (GTEST_FLAG(list_tests)) {
- OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
- OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+
+ OutputJsonKey(stream, kTestsuite, "file", test_info.file(), kIndent);
+ OutputJsonKey(stream, kTestsuite, "line", test_info.line(), kIndent, false);
+ if (GTEST_FLAG_GET(list_tests)) {
*stream << "\n" << Indent(8) << "}";
return;
+ } else {
+ *stream << ",\n";
}
OutputJsonKey(stream, kTestsuite, "status",
@@ -4710,7 +4713,9 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
if (part.failed()) {
*stream << ",\n";
if (++failures == 1) {
- *stream << kIndent << "\"" << "failures" << "\": [\n";
+ *stream << kIndent << "\""
+ << "failures"
+ << "\": [\n";
}
const std::string location =
internal::FormatCompilerIndependentFileLocation(part.file_name(),
@@ -4723,8 +4728,7 @@ void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
}
}
- if (failures > 0)
- *stream << "\n" << kIndent << "]";
+ if (failures > 0) *stream << "\n" << kIndent << "]";
*stream << "\n" << Indent(8) << "}";
}
@@ -4738,7 +4742,7 @@ void JsonUnitTestResultPrinter::PrintJsonTestSuite(
OutputJsonKey(stream, kTestsuite, "name", test_suite.name(), kIndent);
OutputJsonKey(stream, kTestsuite, "tests", test_suite.reportable_test_count(),
kIndent);
- if (!GTEST_FLAG(list_tests)) {
+ if (!GTEST_FLAG_GET(list_tests)) {
OutputJsonKey(stream, kTestsuite, "failures",
test_suite.failed_test_count(), kIndent);
OutputJsonKey(stream, kTestsuite, "disabled",
@@ -4785,7 +4789,7 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
OutputJsonKey(stream, kTestsuites, "disabled",
unit_test.reportable_disabled_test_count(), kIndent);
OutputJsonKey(stream, kTestsuites, "errors", 0, kIndent);
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
OutputJsonKey(stream, kTestsuites, "random_seed", unit_test.random_seed(),
kIndent);
}
@@ -4820,7 +4824,9 @@ void JsonUnitTestResultPrinter::PrintJsonUnitTest(std::ostream* stream,
OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
}
- *stream << "\n" << kIndent << "]\n" << "}\n";
+ *stream << "\n"
+ << kIndent << "]\n"
+ << "}\n";
}
void JsonUnitTestResultPrinter::PrintJsonTestList(
@@ -4855,7 +4861,8 @@ std::string JsonUnitTestResultPrinter::TestPropertiesAsJson(
Message attributes;
for (int i = 0; i < result.test_property_count(); ++i) {
const TestProperty& property = result.GetTestProperty(i);
- attributes << ",\n" << indent << "\"" << property.key() << "\": "
+ attributes << ",\n"
+ << indent << "\"" << property.key() << "\": "
<< "\"" << EscapeJson(property.value()) << "\"";
}
return attributes.GetString();
@@ -4895,14 +4902,14 @@ void StreamingListener::SocketWriter::MakeConnection() {
addrinfo hints;
memset(&hints, 0, sizeof(hints));
- hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses.
+ hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses.
hints.ai_socktype = SOCK_STREAM;
addrinfo* servinfo = nullptr;
// Use getaddrinfo() to get a linked list of IP addresses for
// the given host name.
- const int error_num = getaddrinfo(
- host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
+ const int error_num =
+ getaddrinfo(host_name_.c_str(), port_num_.c_str(), &hints, &servinfo);
if (error_num != 0) {
GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: "
<< gai_strerror(error_num);
@@ -4911,8 +4918,8 @@ void StreamingListener::SocketWriter::MakeConnection() {
// Loop through all the results and connect to the first we can.
for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != nullptr;
cur_addr = cur_addr->ai_next) {
- sockfd_ = socket(
- cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol);
+ sockfd_ = socket(cur_addr->ai_family, cur_addr->ai_socktype,
+ cur_addr->ai_protocol);
if (sockfd_ != -1) {
// Connect the client socket to the server socket.
if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) {
@@ -4962,7 +4969,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
for (int i = 0; i < raw_stack_size; ++i) {
if (raw_stack[i] == caller_frame &&
- !GTEST_FLAG(show_internal_stack_frames)) {
+ !GTEST_FLAG_GET(show_internal_stack_frames)) {
// Add a marker to the trace and stop adding frames.
absl::StrAppend(&result, kElidedFramesMarker, "\n");
break;
@@ -4981,7 +4988,7 @@ std::string OsStackTraceGetter::CurrentStackTrace(int max_depth, int skip_count)
return result;
-#else // !GTEST_HAS_ABSL
+#else // !GTEST_HAS_ABSL
static_cast<void>(max_depth);
static_cast<void>(skip_count);
return "";
@@ -5005,14 +5012,14 @@ void OsStackTraceGetter::UponLeavingGTest() GTEST_LOCK_EXCLUDED_(mutex_) {
class ScopedPrematureExitFile {
public:
explicit ScopedPrematureExitFile(const char* premature_exit_filepath)
- : premature_exit_filepath_(premature_exit_filepath ?
- premature_exit_filepath : "") {
+ : premature_exit_filepath_(
+ premature_exit_filepath ? premature_exit_filepath : "") {
// If a path to the premature-exit file is specified...
if (!premature_exit_filepath_.empty()) {
// create the file with a single "0" character in it. I/O
// errors are ignored as there's nothing better we can do and we
// don't want to fail the test because of this.
- FILE* pfile = posix::FOpen(premature_exit_filepath, "w");
+ FILE* pfile = posix::FOpen(premature_exit_filepath_.c_str(), "w");
fwrite("0", 1, 1, pfile);
fclose(pfile);
}
@@ -5034,7 +5041,8 @@ class ScopedPrematureExitFile {
private:
const std::string premature_exit_filepath_;
- GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedPrematureExitFile);
+ ScopedPrematureExitFile(const ScopedPrematureExitFile&) = delete;
+ ScopedPrematureExitFile& operator=(const ScopedPrematureExitFile&) = delete;
};
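// How this is used (illustrative summary): UnitTest::Run() keeps a
// ScopedPrematureExitFile alive while the tests execute, so the "0" file
// exists only during the run; a harness that still finds the file after the
// process has exited can conclude the program terminated prematurely.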
} // namespace internal
@@ -5208,7 +5216,7 @@ int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); }
// Gets the time of the test program start, in ms from the start of the
// UNIX epoch.
internal::TimeInMillis UnitTest::start_timestamp() const {
- return impl()->start_timestamp();
+ return impl()->start_timestamp();
}
// Gets the elapsed time, in milliseconds.
@@ -5251,9 +5259,7 @@ TestSuite* UnitTest::GetMutableTestSuite(int i) {
// Returns the list of event listeners that can be used to track events
// inside Google Test.
-TestEventListeners& UnitTest::listeners() {
- return *impl()->listeners();
-}
+TestEventListeners& UnitTest::listeners() { return *impl()->listeners(); }
// Registers and returns a global test environment. When a test
// program is run, all global test environments will be set-up in the
@@ -5278,12 +5284,11 @@ Environment* UnitTest::AddEnvironment(Environment* env) {
// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
// this to report their results. The user code should use the
// assertion macros instead of calling this directly.
-void UnitTest::AddTestPartResult(
- TestPartResult::Type result_type,
- const char* file_name,
- int line_number,
- const std::string& message,
- const std::string& os_stack_trace) GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+ const char* file_name, int line_number,
+ const std::string& message,
+ const std::string& os_stack_trace)
+ GTEST_LOCK_EXCLUDED_(mutex_) {
Message msg;
msg << message;
@@ -5293,8 +5298,9 @@ void UnitTest::AddTestPartResult(
for (size_t i = impl_->gtest_trace_stack().size(); i > 0; --i) {
const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
- msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
- << " " << trace.message;
+ msg << "\n"
+ << internal::FormatFileLocation(trace.file, trace.line) << " "
+ << trace.message;
}
}
@@ -5304,8 +5310,8 @@ void UnitTest::AddTestPartResult(
const TestPartResult result = TestPartResult(
result_type, file_name, line_number, msg.GetString().c_str());
- impl_->GetTestPartResultReporterForCurrentThread()->
- ReportTestPartResult(result);
+ impl_->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+ result);
if (result_type != TestPartResult::kSuccess &&
result_type != TestPartResult::kSkip) {
@@ -5314,7 +5320,7 @@ void UnitTest::AddTestPartResult(
// in the code (perhaps in order to use Google Test assertions
// with another testing framework) and specify the former on the
// command line for debugging.
- if (GTEST_FLAG(break_on_failure)) {
+ if (GTEST_FLAG_GET(break_on_failure)) {
#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// Using DebugBreak on Windows allows gtest to still break into a debugger
// when a failure happens and both the --gtest_break_on_failure and
@@ -5331,7 +5337,7 @@ void UnitTest::AddTestPartResult(
// portability: some debuggers don't correctly trap abort().
*static_cast<volatile int*>(nullptr) = 1;
#endif // GTEST_OS_WINDOWS
- } else if (GTEST_FLAG(throw_on_failure)) {
+ } else if (GTEST_FLAG_GET(throw_on_failure)) {
#if GTEST_HAS_EXCEPTIONS
throw internal::GoogleTestFailureException(result);
#else
@@ -5360,7 +5366,7 @@ void UnitTest::RecordProperty(const std::string& key,
// from the main thread.
int UnitTest::Run() {
const bool in_death_test_child_process =
- internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+ GTEST_FLAG_GET(internal_run_death_test).length() > 0;
// Google Test implements this protocol for detecting when a test
// program exits before returning control to Google Test:
@@ -5390,7 +5396,7 @@ int UnitTest::Run() {
// Captures the value of GTEST_FLAG(catch_exceptions). This value will be
// used for the duration of the program.
- impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+ impl()->set_catch_exceptions(GTEST_FLAG_GET(catch_exceptions));
#if GTEST_OS_WINDOWS
// Either the user wants Google Test to catch exceptions thrown by the
@@ -5398,26 +5404,26 @@ int UnitTest::Run() {
// process. In either case the user does not want to see pop-up dialogs
// about crashes - they are expected.
if (impl()->catch_exceptions() || in_death_test_child_process) {
-# if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
// SetErrorMode doesn't exist on CE.
SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
-# endif // !GTEST_OS_WINDOWS_MOBILE
+#endif // !GTEST_OS_WINDOWS_MOBILE
-# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+#if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
// Death test children can be terminated with _abort(). On Windows,
// _abort() can show a dialog with a warning message. This forces the
// abort message to go to stderr instead.
_set_error_mode(_OUT_TO_STDERR);
-# endif
+#endif
-# if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
+#if defined(_MSC_VER) && !GTEST_OS_WINDOWS_MOBILE
// In the debug version, Visual Studio pops up a separate dialog
// offering a choice to debug the aborted program. We need to suppress
// this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
// executed. Google Test will notify the user of any unexpected
// failure via stderr.
- if (!GTEST_FLAG(break_on_failure))
+ if (!GTEST_FLAG_GET(break_on_failure))
_set_abort_behavior(
0x0, // Clear the following flags:
_WRITE_ABORT_MSG | _CALL_REPORTFAULT); // pop-up window, core dump.
@@ -5431,14 +5437,15 @@ int UnitTest::Run() {
_CRTDBG_MODE_FILE | _CRTDBG_MODE_DEBUG);
(void)_CrtSetReportFile(_CRT_ASSERT, _CRTDBG_FILE_STDERR);
}
-# endif
+#endif
}
#endif // GTEST_OS_WINDOWS
return internal::HandleExceptionsInMethodIfSupported(
- impl(),
- &internal::UnitTestImpl::RunAllTests,
- "auxiliary test code (environments or event listeners)") ? 0 : 1;
+ impl(), &internal::UnitTestImpl::RunAllTests,
+ "auxiliary test code (environments or event listeners)")
+ ? 0
+ : 1;
}
// Returns the working directory when the first TEST() or TEST_F() was
@@ -5483,14 +5490,10 @@ UnitTest::parameterized_test_registry() GTEST_LOCK_EXCLUDED_(mutex_) {
}
// Creates an empty UnitTest.
-UnitTest::UnitTest() {
- impl_ = new internal::UnitTestImpl(this);
-}
+UnitTest::UnitTest() { impl_ = new internal::UnitTestImpl(this); }
// Destructor of UnitTest.
-UnitTest::~UnitTest() {
- delete impl_;
-}
+UnitTest::~UnitTest() { delete impl_; }
// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
// Google Test trace stack.
@@ -5501,8 +5504,7 @@ void UnitTest::PushGTestTrace(const internal::TraceInfo& trace)
}
// Pops a trace from the per-thread Google Test trace stack.
-void UnitTest::PopGTestTrace()
- GTEST_LOCK_EXCLUDED_(mutex_) {
+void UnitTest::PopGTestTrace() GTEST_LOCK_EXCLUDED_(mutex_) {
internal::MutexLock lock(&mutex_);
impl_->gtest_trace_stack().pop_back();
}
@@ -5599,12 +5601,12 @@ void UnitTestImpl::ConfigureXmlOutput() {
// Initializes event listeners for streaming test results in string form.
// Must not be called before InitGoogleTest.
void UnitTestImpl::ConfigureStreamingOutput() {
- const std::string& target = GTEST_FLAG(stream_result_to);
+ const std::string& target = GTEST_FLAG_GET(stream_result_to);
if (!target.empty()) {
const size_t pos = target.find(':');
if (pos != std::string::npos) {
- listeners()->Append(new StreamingListener(target.substr(0, pos),
- target.substr(pos+1)));
+ listeners()->Append(
+ new StreamingListener(target.substr(0, pos), target.substr(pos + 1)));
} else {
GTEST_LOG_(WARNING) << "unrecognized streaming target \"" << target
<< "\" ignored.";
@@ -5642,7 +5644,7 @@ void UnitTestImpl::PostFlagParsingInit() {
// to shut down the default XML output before invoking RUN_ALL_TESTS.
ConfigureXmlOutput();
- if (GTEST_FLAG(brief)) {
+ if (GTEST_FLAG_GET(brief)) {
listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
}
@@ -5652,7 +5654,7 @@ void UnitTestImpl::PostFlagParsingInit() {
#endif // GTEST_CAN_STREAM_RESULTS_
#if GTEST_HAS_ABSL
- if (GTEST_FLAG(install_failure_signal_handler)) {
+ if (GTEST_FLAG_GET(install_failure_signal_handler)) {
absl::FailureSignalHandlerOptions options;
absl::InstallFailureSignalHandler(options);
}
@@ -5710,9 +5712,9 @@ TestSuite* UnitTestImpl::GetTestSuite(
auto* const new_test_suite =
new TestSuite(test_suite_name, type_param, set_up_tc, tear_down_tc);
+ const UnitTestFilter death_test_suite_filter(kDeathTestSuiteFilter);
// Is this a death test suite?
- if (internal::UnitTestOptions::MatchesFilter(test_suite_name,
- kDeathTestSuiteFilter)) {
+ if (death_test_suite_filter.MatchesName(test_suite_name)) {
// Yes. Inserts the test suite after the last death test suite
// defined so far. This only works when the test suites haven't
// been shuffled. Otherwise we may end up running a death test
@@ -5749,8 +5751,7 @@ bool UnitTestImpl::RunAllTests() {
const bool gtest_is_initialized_before_run_all_tests = GTestIsInitialized();
// Do not run any test if the --help flag was specified.
- if (g_help_flag)
- return true;
+ if (g_help_flag) return true;
// Repeats the call to the post-flag parsing initialization in case the
// user didn't call InitGoogleTest.
@@ -5768,11 +5769,11 @@ bool UnitTestImpl::RunAllTests() {
#if GTEST_HAS_DEATH_TEST
in_subprocess_for_death_test =
(internal_run_death_test_flag_.get() != nullptr);
-# if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#if defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
if (in_subprocess_for_death_test) {
GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_();
}
-# endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
+#endif // defined(GTEST_EXTRA_DEATH_TEST_CHILD_SETUP_)
#endif // GTEST_HAS_DEATH_TEST
const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex,
@@ -5780,19 +5781,18 @@ bool UnitTestImpl::RunAllTests() {
// Compares the full test names with the filter to decide which
// tests to run.
- const bool has_tests_to_run = FilterTests(should_shard
- ? HONOR_SHARDING_PROTOCOL
- : IGNORE_SHARDING_PROTOCOL) > 0;
+ const bool has_tests_to_run =
+ FilterTests(should_shard ? HONOR_SHARDING_PROTOCOL
+ : IGNORE_SHARDING_PROTOCOL) > 0;
// Lists the tests and exits if the --gtest_list_tests flag was specified.
- if (GTEST_FLAG(list_tests)) {
+ if (GTEST_FLAG_GET(list_tests)) {
// This must be called *after* FilterTests() has been called.
ListTestsMatchingFilter();
return true;
}
- random_seed_ = GTEST_FLAG(shuffle) ?
- GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0;
+ random_seed_ = GetRandomSeedFromFlag(GTEST_FLAG_GET(random_seed));
// True if and only if at least one test has failed.
bool failed = false;
@@ -5804,9 +5804,21 @@ bool UnitTestImpl::RunAllTests() {
// How many times to repeat the tests? We don't want to repeat them
// when we are inside the subprocess of a death test.
- const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat);
+ const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG_GET(repeat);
+
// Repeats forever if the repeat count is negative.
const bool gtest_repeat_forever = repeat < 0;
+
+ // Should test environments be set up and torn down for each repeat, or only
+ // set up on the first and torn down on the last iteration? If there is no
+ // "last" iteration because the tests will repeat forever, always recreate the
+ // environments to avoid leaks in case one of the environments is using
+ // resources that are external to this process. Without this check there would
+ // be no way to clean up those external resources automatically.
+ const bool recreate_environments_when_repeating =
+ GTEST_FLAG_GET(recreate_environments_when_repeating) ||
+ gtest_repeat_forever;
+
for (int i = 0; gtest_repeat_forever || i != repeat; i++) {
// We want to preserve failures generated by ad-hoc test
// assertions executed before RUN_ALL_TESTS().
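Note: the new recreate_environments_when_repeating flag above controls whether global test environments are rebuilt on every --gtest_repeat iteration instead of once per run (and repeating forever always rebuilds them). A minimal sketch of the observable difference; CountingEnv is a hypothetical environment, not part of this patch:

  #include <cstdio>
  #include "gtest/gtest.h"

  // Hypothetical environment: its SetUp count shows when environments are
  // recreated. With --gtest_repeat=3 it is set up once by default; adding
  // --gtest_recreate_environments_when_repeating it is set up three times.
  class CountingEnv : public ::testing::Environment {
   public:
    void SetUp() override { std::printf("env SetUp #%d\n", ++setups_); }
    void TearDown() override { std::printf("env TearDown\n"); }

   private:
    int setups_ = 0;
  };

  TEST(RepeatDemo, Trivial) { EXPECT_TRUE(true); }

  int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);
    ::testing::AddGlobalTestEnvironment(new CountingEnv);  // gtest owns it
    return RUN_ALL_TESTS();
  }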
@@ -5815,7 +5827,7 @@ bool UnitTestImpl::RunAllTests() {
Timer timer;
// Shuffles test suites and tests if requested.
- if (has_tests_to_run && GTEST_FLAG(shuffle)) {
+ if (has_tests_to_run && GTEST_FLAG_GET(shuffle)) {
random()->Reseed(static_cast<uint32_t>(random_seed_));
// This should be done before calling OnTestIterationStart(),
// such that a test event listener can see the actual test order
@@ -5828,10 +5840,13 @@ bool UnitTestImpl::RunAllTests() {
// Runs each test suite if there is at least one test to run.
if (has_tests_to_run) {
- // Sets up all environments beforehand.
- repeater->OnEnvironmentsSetUpStart(*parent_);
- ForEach(environments_, SetUpEnvironment);
- repeater->OnEnvironmentsSetUpEnd(*parent_);
+ // Sets up all environments beforehand. If test environments aren't
+ // recreated for each iteration, only do so on the first iteration.
+ if (i == 0 || recreate_environments_when_repeating) {
+ repeater->OnEnvironmentsSetUpStart(*parent_);
+ ForEach(environments_, SetUpEnvironment);
+ repeater->OnEnvironmentsSetUpEnd(*parent_);
+ }
// Runs the tests only if there was no fatal failure or skip triggered
// during global set-up.
@@ -5853,7 +5868,7 @@ bool UnitTestImpl::RunAllTests() {
for (int test_index = 0; test_index < total_test_suite_count();
test_index++) {
GetMutableSuiteCase(test_index)->Run();
- if (GTEST_FLAG(fail_fast) &&
+ if (GTEST_FLAG_GET(fail_fast) &&
GetMutableSuiteCase(test_index)->Failed()) {
for (int j = test_index + 1; j < total_test_suite_count(); j++) {
GetMutableSuiteCase(j)->Skip();
@@ -5871,11 +5886,15 @@ bool UnitTestImpl::RunAllTests() {
}
}
- // Tears down all environments in reverse order afterwards.
- repeater->OnEnvironmentsTearDownStart(*parent_);
- std::for_each(environments_.rbegin(), environments_.rend(),
- TearDownEnvironment);
- repeater->OnEnvironmentsTearDownEnd(*parent_);
+ // Tears down all environments in reverse order afterwards. If test
+ // environments aren't recreated for each iteration, only do so on the
+ // last iteration.
+ if (i == repeat - 1 || recreate_environments_when_repeating) {
+ repeater->OnEnvironmentsTearDownStart(*parent_);
+ std::for_each(environments_.rbegin(), environments_.rend(),
+ TearDownEnvironment);
+ repeater->OnEnvironmentsTearDownEnd(*parent_);
+ }
}
elapsed_time_ = timer.Elapsed();
@@ -5896,7 +5915,7 @@ bool UnitTestImpl::RunAllTests() {
// (it's always safe to unshuffle the tests).
UnshuffleTests();
- if (GTEST_FLAG(shuffle)) {
+ if (GTEST_FLAG_GET(shuffle)) {
// Picks a new random seed for each iteration.
random_seed_ = GetNextRandomSeed(random_seed_);
}
@@ -5947,8 +5966,7 @@ void WriteToShardStatusFileIfNeeded() {
// an error and exits. If in_subprocess_for_death_test, sharding is
// disabled because it must only be applied to the original test
// process. Otherwise, we could filter out death tests we intended to execute.
-bool ShouldShard(const char* total_shards_env,
- const char* shard_index_env,
+bool ShouldShard(const char* total_shards_env, const char* shard_index_env,
bool in_subprocess_for_death_test) {
if (in_subprocess_for_death_test) {
return false;
@@ -5960,27 +5978,27 @@ bool ShouldShard(const char* total_shards_env,
if (total_shards == -1 && shard_index == -1) {
return false;
} else if (total_shards == -1 && shard_index != -1) {
- const Message msg = Message()
- << "Invalid environment variables: you have "
- << kTestShardIndex << " = " << shard_index
- << ", but have left " << kTestTotalShards << " unset.\n";
+ const Message msg = Message() << "Invalid environment variables: you have "
+ << kTestShardIndex << " = " << shard_index
+ << ", but have left " << kTestTotalShards
+ << " unset.\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
} else if (total_shards != -1 && shard_index == -1) {
const Message msg = Message()
- << "Invalid environment variables: you have "
- << kTestTotalShards << " = " << total_shards
- << ", but have left " << kTestShardIndex << " unset.\n";
+ << "Invalid environment variables: you have "
+ << kTestTotalShards << " = " << total_shards
+ << ", but have left " << kTestShardIndex << " unset.\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
} else if (shard_index < 0 || shard_index >= total_shards) {
- const Message msg = Message()
- << "Invalid environment variables: we require 0 <= "
- << kTestShardIndex << " < " << kTestTotalShards
- << ", but you have " << kTestShardIndex << "=" << shard_index
- << ", " << kTestTotalShards << "=" << total_shards << ".\n";
+ const Message msg =
+ Message() << "Invalid environment variables: we require 0 <= "
+ << kTestShardIndex << " < " << kTestTotalShards
+ << ", but you have " << kTestShardIndex << "=" << shard_index
+ << ", " << kTestTotalShards << "=" << total_shards << ".\n";
ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
fflush(stdout);
exit(EXIT_FAILURE);
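Note: the validation above enforces the sharding contract — both environment variables set together and 0 <= GTEST_SHARD_INDEX < GTEST_TOTAL_SHARDS. The assignment itself is a simple round-robin; a sketch under the assumption that it mirrors ShouldRunTestOnShard() later in this file:

  #include <cassert>

  // Round-robin shard assignment by run-order position: with
  // GTEST_TOTAL_SHARDS=3, shard 1 runs tests 1, 4, 7, ...
  static bool RunsOnShard(int total_shards, int shard_index, int test_id) {
    assert(total_shards > 0);
    assert(shard_index >= 0 && shard_index < total_shards);
    return (test_id % total_shards) == shard_index;
  }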
@@ -6022,11 +6040,16 @@ bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) {
// https://github.com/google/googletest/blob/master/googletest/docs/advanced.md
// . Returns the number of tests that should run.
int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
- const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ?
- Int32FromEnvOrDie(kTestTotalShards, -1) : -1;
- const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ?
- Int32FromEnvOrDie(kTestShardIndex, -1) : -1;
-
+ const int32_t total_shards = shard_tests == HONOR_SHARDING_PROTOCOL
+ ? Int32FromEnvOrDie(kTestTotalShards, -1)
+ : -1;
+ const int32_t shard_index = shard_tests == HONOR_SHARDING_PROTOCOL
+ ? Int32FromEnvOrDie(kTestShardIndex, -1)
+ : -1;
+
+ const PositiveAndNegativeUnitTestFilter gtest_flag_filter(
+ GTEST_FLAG_GET(filter));
+ const UnitTestFilter disable_test_filter(kDisableTestFilter);
// num_runnable_tests are the number of tests that will
// run across all shards (i.e., match filter and are not disabled).
// num_selected_tests are the number of tests to be run on
@@ -6042,18 +6065,17 @@ int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) {
const std::string test_name(test_info->name());
// A test is disabled if test suite name or test name matches
// kDisableTestFilter.
- const bool is_disabled = internal::UnitTestOptions::MatchesFilter(
- test_suite_name, kDisableTestFilter) ||
- internal::UnitTestOptions::MatchesFilter(
- test_name, kDisableTestFilter);
+ const bool is_disabled =
+ disable_test_filter.MatchesName(test_suite_name) ||
+ disable_test_filter.MatchesName(test_name);
test_info->is_disabled_ = is_disabled;
- const bool matches_filter = internal::UnitTestOptions::FilterMatchesTest(
- test_suite_name, test_name);
+ const bool matches_filter =
+ gtest_flag_filter.MatchesTest(test_suite_name, test_name);
test_info->matches_filter_ = matches_filter;
const bool is_runnable =
- (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) &&
+ (GTEST_FLAG_GET(also_run_disabled_tests) || !is_disabled) &&
matches_filter;
const bool is_in_another_shard =
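Note: the rewrite above builds the --gtest_filter matchers once per FilterTests() call instead of re-parsing the flag string for every test. The grammar is unchanged: ':'-separated glob patterns with '*' and '?', plus an optional '-' introducing negative patterns, applied to the full "Suite.Test" name. An illustrative matcher — GlobMatch is a toy stand-in, not the new UnitTestFilter API:

  // Minimal '*'/'?' glob, recursive for clarity. A full test name such as
  // "FooTest.Bar" is kept if it matches any positive pattern and no
  // negative one, e.g. --gtest_filter=FooTest.*-FooTest.Slow
  static bool GlobMatch(const char *p, const char *s) {
    if (*p == '\0') return *s == '\0';
    if (*p == '*')
      return GlobMatch(p + 1, s) || (*s != '\0' && GlobMatch(p, s + 1));
    return *s != '\0' && (*p == '?' || *p == *s) && GlobMatch(p + 1, s + 1);
  }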
@@ -6222,8 +6244,8 @@ void UnitTestImpl::UnshuffleTests() {
// For example, if Foo() calls Bar(), which in turn calls
// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in
// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't.
-std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
- int skip_count) {
+GTEST_NO_INLINE_ GTEST_NO_TAIL_CALL_ std::string
+GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, int skip_count) {
// We pass skip_count + 1 to skip this wrapper function in addition
// to what the user really wants to skip.
return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1);
@@ -6233,7 +6255,7 @@ std::string GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/,
// suppress unreachable code warnings.
namespace {
class ClassUniqueToAlwaysTrue {};
-}
+} // namespace
bool IsTrue(bool condition) { return condition; }
@@ -6241,8 +6263,7 @@ bool AlwaysTrue() {
#if GTEST_HAS_EXCEPTIONS
// This condition is always false so AlwaysTrue() never actually throws,
// but it makes the compiler think that it may throw.
- if (IsTrue(false))
- throw ClassUniqueToAlwaysTrue();
+ if (IsTrue(false)) throw ClassUniqueToAlwaysTrue();
#endif // GTEST_HAS_EXCEPTIONS
return true;
}
@@ -6264,13 +6285,14 @@ bool SkipPrefix(const char* prefix, const char** pstr) {
// part can be omitted.
//
// Returns the value of the flag, or NULL if the parsing failed.
-static const char* ParseFlagValue(const char* str, const char* flag,
+static const char* ParseFlagValue(const char* str, const char* flag_name,
bool def_optional) {
// str and flag must not be NULL.
- if (str == nullptr || flag == nullptr) return nullptr;
+ if (str == nullptr || flag_name == nullptr) return nullptr;
// The flag must start with "--" followed by GTEST_FLAG_PREFIX_.
- const std::string flag_str = std::string("--") + GTEST_FLAG_PREFIX_ + flag;
+ const std::string flag_str =
+ std::string("--") + GTEST_FLAG_PREFIX_ + flag_name;
const size_t flag_len = flag_str.length();
if (strncmp(str, flag_str.c_str(), flag_len) != 0) return nullptr;
@@ -6301,9 +6323,9 @@ static const char* ParseFlagValue(const char* str, const char* flag,
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
+static bool ParseFlag(const char* str, const char* flag_name, bool* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, true);
+ const char* const value_str = ParseFlagValue(str, flag_name, true);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
@@ -6317,16 +6339,16 @@ static bool ParseBoolFlag(const char* str, const char* flag, bool* value) {
//
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
-bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
+bool ParseFlag(const char* str, const char* flag_name, int32_t* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, false);
+ const char* const value_str = ParseFlagValue(str, flag_name, false);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
// Sets *value to the value of the flag.
- return ParseInt32(Message() << "The value of flag --" << flag,
- value_str, value);
+ return ParseInt32(Message() << "The value of flag --" << flag_name, value_str,
+ value);
}
// Parses a string for a string flag, in the form of "--flag=value".
@@ -6334,9 +6356,9 @@ bool ParseInt32Flag(const char* str, const char* flag, int32_t* value) {
// On success, stores the value of the flag in *value, and returns
// true. On failure, returns false without changing *value.
template <typename String>
-static bool ParseStringFlag(const char* str, const char* flag, String* value) {
+static bool ParseFlag(const char* str, const char* flag_name, String* value) {
// Gets the value of the flag as a string.
- const char* const value_str = ParseFlagValue(str, flag, false);
+ const char* const value_str = ParseFlagValue(str, flag_name, false);
// Aborts if the parsing failed.
if (value_str == nullptr) return false;
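Note: all three ParseFlag() overloads funnel through ParseFlagValue(), which strips the "--gtest_<name>=" prefix and returns a pointer to the value text, or nullptr when the prefix does not match. A condensed, self-contained sketch of that path; FlagValue is illustrative:

  #include <cstdlib>
  #include <cstring>
  #include <string>

  // Returns the text after "--gtest_<name>=", or nullptr on mismatch.
  static const char *FlagValue(const char *arg, const char *name) {
    const std::string prefix = std::string("--gtest_") + name + "=";
    return std::strncmp(arg, prefix.c_str(), prefix.size()) == 0
               ? arg + prefix.size()
               : nullptr;
  }

  int main() {
    const char *v = FlagValue("--gtest_repeat=10", "repeat");
    return (v != nullptr && std::atoi(v) == 10) ? 0 : 1;  // exits 0
  }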
@@ -6353,8 +6375,7 @@ static bool ParseStringFlag(const char* str, const char* flag, String* value) {
// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test
// internal flags and do not trigger the help message.
static bool HasGoogleTestFlagPrefix(const char* str) {
- return (SkipPrefix("--", &str) ||
- SkipPrefix("-", &str) ||
+ return (SkipPrefix("--", &str) || SkipPrefix("-", &str) ||
SkipPrefix("/", &str)) &&
!SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) &&
(SkipPrefix(GTEST_FLAG_PREFIX_, &str) ||
@@ -6437,6 +6458,10 @@ static const char kColorEncodedHelpMessage[] =
"random_seed=@Y[NUMBER]@D\n"
" Random number seed to use for shuffling test orders (between 1 and\n"
" 99999, or 0 to use a seed based on the current time).\n"
+ " @G--" GTEST_FLAG_PREFIX_
+ "recreate_environments_when_repeating@D\n"
+ " Sets up and tears down the global test environment on each repeat\n"
+ " of the test.\n"
"\n"
"Test Output:\n"
" @G--" GTEST_FLAG_PREFIX_
@@ -6454,18 +6479,18 @@ static const char kColorEncodedHelpMessage[] =
" Generate a JSON or XML report in the given directory or with the "
"given\n"
" file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
-# if GTEST_CAN_STREAM_RESULTS_
+#if GTEST_CAN_STREAM_RESULTS_
" @G--" GTEST_FLAG_PREFIX_
"stream_result_to=@YHOST@G:@YPORT@D\n"
" Stream test results to the given server.\n"
-# endif // GTEST_CAN_STREAM_RESULTS_
+#endif // GTEST_CAN_STREAM_RESULTS_
"\n"
"Assertion Behavior:\n"
-# if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
" @G--" GTEST_FLAG_PREFIX_
"death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
" Set the default death test style.\n"
-# endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
+#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
" @G--" GTEST_FLAG_PREFIX_
"break_on_failure@D\n"
" Turn assertion failures into debugger break-points.\n"
@@ -6497,41 +6522,44 @@ static const char kColorEncodedHelpMessage[] =
"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
static bool ParseGoogleTestFlag(const char* const arg) {
- return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
- &GTEST_FLAG(also_run_disabled_tests)) ||
- ParseBoolFlag(arg, kBreakOnFailureFlag,
- &GTEST_FLAG(break_on_failure)) ||
- ParseBoolFlag(arg, kCatchExceptionsFlag,
- &GTEST_FLAG(catch_exceptions)) ||
- ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
- ParseStringFlag(arg, kDeathTestStyleFlag,
- &GTEST_FLAG(death_test_style)) ||
- ParseBoolFlag(arg, kDeathTestUseFork,
- &GTEST_FLAG(death_test_use_fork)) ||
- ParseBoolFlag(arg, kFailFast, &GTEST_FLAG(fail_fast)) ||
- ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
- ParseStringFlag(arg, kInternalRunDeathTestFlag,
- &GTEST_FLAG(internal_run_death_test)) ||
- ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
- ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
- ParseBoolFlag(arg, kBriefFlag, &GTEST_FLAG(brief)) ||
- ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
- ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
- ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
- ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
- ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
- ParseInt32Flag(arg, kStackTraceDepthFlag,
- &GTEST_FLAG(stack_trace_depth)) ||
- ParseStringFlag(arg, kStreamResultToFlag,
- &GTEST_FLAG(stream_result_to)) ||
- ParseBoolFlag(arg, kThrowOnFailureFlag, &GTEST_FLAG(throw_on_failure));
+#define GTEST_INTERNAL_PARSE_FLAG(flag_name) \
+ do { \
+ auto value = GTEST_FLAG_GET(flag_name); \
+ if (ParseFlag(arg, #flag_name, &value)) { \
+ GTEST_FLAG_SET(flag_name, value); \
+ return true; \
+ } \
+ } while (false)
+
+ GTEST_INTERNAL_PARSE_FLAG(also_run_disabled_tests);
+ GTEST_INTERNAL_PARSE_FLAG(break_on_failure);
+ GTEST_INTERNAL_PARSE_FLAG(catch_exceptions);
+ GTEST_INTERNAL_PARSE_FLAG(color);
+ GTEST_INTERNAL_PARSE_FLAG(death_test_style);
+ GTEST_INTERNAL_PARSE_FLAG(death_test_use_fork);
+ GTEST_INTERNAL_PARSE_FLAG(fail_fast);
+ GTEST_INTERNAL_PARSE_FLAG(filter);
+ GTEST_INTERNAL_PARSE_FLAG(internal_run_death_test);
+ GTEST_INTERNAL_PARSE_FLAG(list_tests);
+ GTEST_INTERNAL_PARSE_FLAG(output);
+ GTEST_INTERNAL_PARSE_FLAG(brief);
+ GTEST_INTERNAL_PARSE_FLAG(print_time);
+ GTEST_INTERNAL_PARSE_FLAG(print_utf8);
+ GTEST_INTERNAL_PARSE_FLAG(random_seed);
+ GTEST_INTERNAL_PARSE_FLAG(repeat);
+ GTEST_INTERNAL_PARSE_FLAG(recreate_environments_when_repeating);
+ GTEST_INTERNAL_PARSE_FLAG(shuffle);
+ GTEST_INTERNAL_PARSE_FLAG(stack_trace_depth);
+ GTEST_INTERNAL_PARSE_FLAG(stream_result_to);
+ GTEST_INTERNAL_PARSE_FLAG(throw_on_failure);
+ return false;
}
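Note: the GTEST_INTERNAL_PARSE_FLAG macro works because the former ParseBoolFlag/ParseInt32Flag/ParseStringFlag helpers are now a single ParseFlag overload set, so one macro body dispatches on whatever type GTEST_FLAG_GET() yields. A standalone demonstration of that mechanism; ParseFlagDemo is hypothetical:

  #include <cstdint>
  #include <iostream>
  #include <string>

  // One call site, three overloads: the pointer's type picks the parser,
  // just as ParseFlag(arg, #flag_name, &value) does above.
  static bool ParseFlagDemo(const std::string &, bool *v) {
    *v = true;
    return true;
  }
  static bool ParseFlagDemo(const std::string &s, int32_t *v) {
    *v = std::stoi(s);
    return true;
  }
  static bool ParseFlagDemo(const std::string &s, std::string *v) {
    *v = s;
    return true;
  }

  int main() {
    bool b = false;
    int32_t i = 0;
    std::string s;
    ParseFlagDemo("1", &b);      // bool overload
    ParseFlagDemo("10", &i);     // int32_t overload
    ParseFlagDemo("Foo.*", &s);  // string overload
    std::cout << b << ' ' << i << ' ' << s << '\n';  // prints: 1 10 Foo.*
  }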
#if GTEST_USE_OWN_FLAGFILE_FLAG_
static void LoadFlagsFromFile(const std::string& path) {
FILE* flagfile = posix::FOpen(path.c_str(), "r");
if (!flagfile) {
- GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG(flagfile)
+ GTEST_LOG_(FATAL) << "Unable to open file \"" << GTEST_FLAG_GET(flagfile)
<< "\"";
}
std::string contents(ReadEntireFile(flagfile));
@@ -6539,10 +6567,8 @@ static void LoadFlagsFromFile(const std::string& path) {
std::vector<std::string> lines;
SplitString(contents, '\n', &lines);
for (size_t i = 0; i < lines.size(); ++i) {
- if (lines[i].empty())
- continue;
- if (!ParseGoogleTestFlag(lines[i].c_str()))
- g_help_flag = true;
+ if (lines[i].empty()) continue;
+ if (!ParseGoogleTestFlag(lines[i].c_str())) g_help_flag = true;
}
}
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
@@ -6552,25 +6578,23 @@ static void LoadFlagsFromFile(const std::string& path) {
// instantiated to either char or wchar_t.
template <typename CharType>
void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
+ std::string flagfile_value;
for (int i = 1; i < *argc; i++) {
const std::string arg_string = StreamableToString(argv[i]);
const char* const arg = arg_string.c_str();
- using internal::ParseBoolFlag;
- using internal::ParseInt32Flag;
- using internal::ParseStringFlag;
+ using internal::ParseFlag;
bool remove_flag = false;
if (ParseGoogleTestFlag(arg)) {
remove_flag = true;
#if GTEST_USE_OWN_FLAGFILE_FLAG_
- } else if (ParseStringFlag(arg, kFlagfileFlag, &GTEST_FLAG(flagfile))) {
- LoadFlagsFromFile(GTEST_FLAG(flagfile));
+ } else if (ParseFlag(arg, "flagfile", &flagfile_value)) {
+ GTEST_FLAG_SET(flagfile, flagfile_value);
+ LoadFlagsFromFile(flagfile_value);
remove_flag = true;
#endif // GTEST_USE_OWN_FLAGFILE_FLAG_
- } else if (arg_string == "--help" || arg_string == "-h" ||
- arg_string == "-?" || arg_string == "/?" ||
- HasGoogleTestFlagPrefix(arg)) {
+ } else if (arg_string == "--help" || HasGoogleTestFlagPrefix(arg)) {
// Both help flag and unrecognized Google Test flags (excluding
// internal ones) trigger help display.
g_help_flag = true;
@@ -6605,7 +6629,27 @@ void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) {
// Parses the command line for Google Test flags, without initializing
// other parts of Google Test.
void ParseGoogleTestFlagsOnly(int* argc, char** argv) {
+#if GTEST_HAS_ABSL
+ if (*argc > 0) {
+ // absl::ParseCommandLine() requires *argc > 0.
+ auto positional_args = absl::flags_internal::ParseCommandLineImpl(
+ *argc, argv, absl::flags_internal::ArgvListAction::kRemoveParsedArgs,
+ absl::flags_internal::UsageFlagsAction::kHandleUsage,
+ absl::flags_internal::OnUndefinedFlag::kReportUndefined);
+ // Any command-line positional arguments not part of any command-line flag
+ // (or arguments to a flag) are copied back out to argv, with the program
+ // invocation name at position 0, and argc is resized. This includes
+ // positional arguments after the flag-terminating delimiter '--'.
+ // See https://abseil.io/docs/cpp/guides/flags.
+ std::copy(positional_args.begin(), positional_args.end(), argv);
+ if (static_cast<int>(positional_args.size()) < *argc) {
+ argv[positional_args.size()] = nullptr;
+ *argc = static_cast<int>(positional_args.size());
+ }
+ }
+#else
ParseGoogleTestFlagsOnlyImpl(argc, argv);
+#endif
// Fix the value of *_NSGetArgc() on macOS, but if and only if
// *_NSGetArgv() == argv
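Note: with GTEST_HAS_ABSL, flag parsing is handed to Abseil and only positional arguments survive in argv. A standalone sketch of the compaction step above; CompactArgs is illustrative, not an Abseil API:

  #include <cstddef>
  #include <vector>

  // Copies the surviving arguments to the front of argv, re-null-terminates
  // it, and shrinks argc, mirroring the copy/resize above.
  static void CompactArgs(int *argc, char **argv,
                          const std::vector<char *> &survivors) {
    for (std::size_t i = 0; i < survivors.size(); ++i) argv[i] = survivors[i];
    if (static_cast<int>(survivors.size()) < *argc) {
      argv[survivors.size()] = nullptr;
      *argc = static_cast<int>(survivors.size());
    }
  }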
@@ -6640,6 +6684,12 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
#if GTEST_HAS_ABSL
absl::InitializeSymbolizer(g_argvs[0].c_str());
+
+ // When using the Abseil Flags library, set the program usage message to the
+ // help message, but remove the color-encoding from the message first.
+ absl::SetProgramUsageMessage(absl::StrReplaceAll(
+ kColorEncodedHelpMessage,
+ {{"@D", ""}, {"@R", ""}, {"@G", ""}, {"@Y", ""}, {"@@", "@"}}));
#endif // GTEST_HAS_ABSL
ParseGoogleTestFlagsOnly(argc, argv);
@@ -6660,7 +6710,7 @@ void InitGoogleTestImpl(int* argc, CharType** argv) {
void InitGoogleTest(int* argc, char** argv) {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
@@ -6670,7 +6720,7 @@ void InitGoogleTest(int* argc, char** argv) {
void InitGoogleTest(int* argc, wchar_t** argv) {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
@@ -6686,42 +6736,42 @@ void InitGoogleTest() {
#if defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_(&argc, argv);
-#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
+#else // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
internal::InitGoogleTestImpl(&argc, argv);
#endif // defined(GTEST_CUSTOM_INIT_GOOGLE_TEST_FUNCTION_)
}
+#if !defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
+// Return value of first environment variable that is set and contains
+// a non-empty string. If there are none, return the "fallback" string.
+// Since we like the temporary directory to have a directory separator suffix,
+// add it if not provided in the environment variable value.
+static std::string GetTempDirFromEnv(
+ std::initializer_list<const char*> environment_variables,
+ const char* fallback, char separator) {
+ for (const char* variable_name : environment_variables) {
+ const char* value = internal::posix::GetEnv(variable_name);
+ if (value != nullptr && value[0] != '\0') {
+ if (value[strlen(value) - 1] != separator) {
+ return std::string(value).append(1, separator);
+ }
+ return value;
+ }
+ }
+ return fallback;
+}
+#endif
+
std::string TempDir() {
#if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
-#elif GTEST_OS_WINDOWS_MOBILE
- return "\\temp\\";
-#elif GTEST_OS_WINDOWS
- const char* temp_dir = internal::posix::GetEnv("TEMP");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "\\temp\\";
- } else if (temp_dir[strlen(temp_dir) - 1] == '\\') {
- return temp_dir;
- } else {
- return std::string(temp_dir) + "\\";
- }
+#elif GTEST_OS_WINDOWS || GTEST_OS_WINDOWS_MOBILE
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TEMP"}, "\\temp\\", '\\');
#elif GTEST_OS_LINUX_ANDROID
- const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "/data/local/tmp/";
- } else {
- return temp_dir;
- }
-#elif GTEST_OS_LINUX
- const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
- if (temp_dir == nullptr || temp_dir[0] == '\0') {
- return "/tmp/";
- } else {
- return temp_dir;
- }
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/data/local/tmp/", '/');
#else
- return "/tmp/";
-#endif // GTEST_OS_WINDOWS_MOBILE
+ return GetTempDirFromEnv({"TEST_TMPDIR", "TMPDIR"}, "/tmp/", '/');
+#endif
}
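Note: the three per-platform copies of the environment probing collapse into GetTempDirFromEnv(), which tries each variable in order and guarantees a trailing separator. Expected behavior, as a small POSIX-only illustration:

  #include <cstdlib>
  #include <iostream>
  #include "gtest/gtest.h"

  int main() {
    // TEST_TMPDIR takes precedence over TMPDIR; the helper appends the
    // missing trailing '/'.
    setenv("TEST_TMPDIR", "/var/tmp", /*overwrite=*/1);
    std::cout << testing::TempDir() << '\n';  // expected: /var/tmp/
  }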
// Class ScopedTrace
@@ -6738,8 +6788,7 @@ void ScopedTrace::PushTrace(const char* file, int line, std::string message) {
}
// Pops the info pushed by the c'tor.
-ScopedTrace::~ScopedTrace()
- GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
+ScopedTrace::~ScopedTrace() GTEST_LOCK_EXCLUDED_(&UnitTest::mutex_) {
UnitTest::GetInstance()->PopGTestTrace();
}
diff --git a/libvpx/third_party/googletest/src/src/gtest_main.cc b/libvpx/third_party/googletest/src/src/gtest_main.cc
index 46b27c3d7..44976375c 100644
--- a/libvpx/third_party/googletest/src/src/gtest_main.cc
+++ b/libvpx/third_party/googletest/src/src/gtest_main.cc
@@ -28,15 +28,14 @@
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <cstdio>
+
#include "gtest/gtest.h"
#if GTEST_OS_ESP8266 || GTEST_OS_ESP32
#if GTEST_OS_ESP8266
extern "C" {
#endif
-void setup() {
- testing::InitGoogleTest();
-}
+void setup() { testing::InitGoogleTest(); }
void loop() { RUN_ALL_TESTS(); }
diff --git a/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py b/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py
index 2d6a4d811..d763de856 100644
--- a/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py
+++ b/libvpx/tools/3D-Reconstruction/MotionEST/Exhaust.py
@@ -83,7 +83,7 @@ class ExhaustNeighbor(MotionEST):
self.beta = beta
self.metric = metric
super(ExhaustNeighbor, self).__init__(cur_f, ref_f, blk_size)
- self.assign = np.zeros((self.num_row, self.num_col), dtype=np.bool)
+ self.assign = np.zeros((self.num_row, self.num_col), dtype=bool)
"""
estimate neighbor loss:
diff --git a/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py b/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py
index 12bc53ff7..37305898a 100644
--- a/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py
+++ b/libvpx/tools/3D-Reconstruction/MotionEST/GroundTruth.py
@@ -29,7 +29,7 @@ class GroundTruth(MotionEST):
def __init__(self, cur_f, ref_f, blk_sz, gt_path, mf=None, mask=None):
self.name = 'ground truth'
super(GroundTruth, self).__init__(cur_f, ref_f, blk_sz)
- self.mask = np.zeros((self.num_row, self.num_col), dtype=np.bool)
+ self.mask = np.zeros((self.num_row, self.num_col), dtype=bool)
if gt_path:
with open(gt_path) as gt_file:
lines = gt_file.readlines()
@@ -42,7 +42,7 @@ class GroundTruth(MotionEST):
self.mask[i, -j - 1] = True
continue
#the order of original file is flipped on the x axis
- self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=np.int)
+ self.mf[i, -j - 1] = np.array([float(y), -float(x)], dtype=int)
else:
self.mf = mf
self.mask = mask
diff --git a/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py b/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py
index 0959530fa..fc393818d 100644
--- a/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py
+++ b/libvpx/tools/3D-Reconstruction/MotionEST/MotionEST.py
@@ -28,8 +28,8 @@ class MotionEST(object):
self.ref_f = ref_f
self.blk_sz = blk_sz
#convert RGB to YUV
- self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=np.int)
- self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=np.int)
+ self.cur_yuv = np.array(self.cur_f.convert('YCbCr'), dtype=int)
+ self.ref_yuv = np.array(self.ref_f.convert('YCbCr'), dtype=int)
#frame size
self.width = self.cur_f.size[0]
self.height = self.cur_f.size[1]
diff --git a/libvpx/tools/3D-Reconstruction/MotionEST/Util.py b/libvpx/tools/3D-Reconstruction/MotionEST/Util.py
index 551881cfd..c2416163b 100644
--- a/libvpx/tools/3D-Reconstruction/MotionEST/Util.py
+++ b/libvpx/tools/3D-Reconstruction/MotionEST/Util.py
@@ -18,7 +18,7 @@ from PIL import Image, ImageDraw
def MSE(blk1, blk2):
return np.mean(
LA.norm(
- np.array(blk1, dtype=np.int) - np.array(blk2, dtype=np.int), axis=2))
+ np.array(blk1, dtype=int) - np.array(blk2, dtype=int), axis=2))
def drawMF(img, blk_sz, mf):
diff --git a/libvpx/vp8/common/findnearmv.c b/libvpx/vp8/common/findnearmv.c
index 6889fdedd..3b3192362 100644
--- a/libvpx/vp8/common/findnearmv.c
+++ b/libvpx/vp8/common/findnearmv.c
@@ -105,9 +105,9 @@ void vp8_find_near_mvs(MACROBLOCKD *xd, const MODE_INFO *here, int_mv *nearest,
tmp = near_mv_ref_cnts[CNT_NEAREST];
near_mv_ref_cnts[CNT_NEAREST] = near_mv_ref_cnts[CNT_NEAR];
near_mv_ref_cnts[CNT_NEAR] = tmp;
- tmp = near_mvs[CNT_NEAREST].as_int;
+ tmp = (int)near_mvs[CNT_NEAREST].as_int;
near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
- near_mvs[CNT_NEAR].as_int = tmp;
+ near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
}
/* Use near_mvs[0] to store the "best" MV */
diff --git a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
index e46827b0e..b9da52084 100644
--- a/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
+++ b/libvpx/vp8/common/mips/dspr2/filter_dspr2.c
@@ -816,8 +816,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
: [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
[Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
- : [src_pixels_per_line] "r"(src_pixels_per_line),
- [output_ptr] "r"(output_ptr));
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
__asm__ __volatile__(
"ulw %[Temp1], 0(%[src_ptr]) \n\t"
@@ -832,8 +832,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
: [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
[Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
- : [src_pixels_per_line] "r"(src_pixels_per_line),
- [output_ptr] "r"(output_ptr));
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
__asm__ __volatile__(
"ulw %[Temp1], 0(%[src_ptr]) \n\t"
@@ -848,8 +848,8 @@ void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
: [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
[Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
- : [src_pixels_per_line] "r"(src_pixels_per_line),
- [output_ptr] "r"(output_ptr));
+ : [src_pixels_per_line] "r"(src_pixels_per_line), [output_ptr] "r"(
+ output_ptr));
output_ptr += 48;
}
diff --git a/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
index ddc881a7f..7cb3c9869 100644
--- a/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
+++ b/libvpx/vp8/common/mips/msa/vp8_macros_msa.h
@@ -69,12 +69,12 @@
#else // !(__mips == 64)
#define LD(psrc) \
({ \
- const uint8_t *psrc_m = (const uint8_t *)(psrc); \
+ const uint8_t *psrc_ld = (const uint8_t *)(psrc); \
uint32_t val0_m, val1_m; \
uint64_t val_m = 0; \
\
- val0_m = LW(psrc_m); \
- val1_m = LW(psrc_m + 4); \
+ val0_m = LW(psrc_ld); \
+ val1_m = LW(psrc_ld + 4); \
\
val_m = (uint64_t)(val1_m); \
val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \
@@ -122,10 +122,11 @@
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint32_t val_m; \
\
- asm volatile("lwr %[val_m], 0(%[psrc_m]) \n\t" \
- "lwl %[val_m], 3(%[psrc_m]) \n\t" \
- : [val_m] "=&r"(val_m) \
- : [psrc_m] "r"(psrc_m)); \
+ asm volatile( \
+ "lwr %[val_m], 0(%[psrc_m]) \n\t" \
+ "lwl %[val_m], 3(%[psrc_m]) \n\t" \
+ : [val_m] "=&r"(val_m) \
+ : [psrc_m] "r"(psrc_m)); \
\
val_m; \
})
@@ -136,10 +137,11 @@
const uint8_t *psrc_m = (const uint8_t *)(psrc); \
uint64_t val_m = 0; \
\
- asm volatile("ldr %[val_m], 0(%[psrc_m]) \n\t" \
- "ldl %[val_m], 7(%[psrc_m]) \n\t" \
- : [val_m] "=&r"(val_m) \
- : [psrc_m] "r"(psrc_m)); \
+ asm volatile( \
+ "ldr %[val_m], 0(%[psrc_m]) \n\t" \
+ "ldl %[val_m], 7(%[psrc_m]) \n\t" \
+ : [val_m] "=&r"(val_m) \
+ : [psrc_m] "r"(psrc_m)); \
\
val_m; \
})
diff --git a/libvpx/vp8/decoder/dboolhuff.h b/libvpx/vp8/decoder/dboolhuff.h
index f2a18f0d9..673b2fbd5 100644
--- a/libvpx/vp8/decoder/dboolhuff.h
+++ b/libvpx/vp8/decoder/dboolhuff.h
@@ -15,6 +15,7 @@
#include <limits.h>
#include "./vpx_config.h"
+#include "vpx_ports/compiler_attributes.h"
#include "vpx_ports/mem.h"
#include "vpx/vp8dx.h"
#include "vpx/vpx_integer.h"
@@ -50,7 +51,8 @@ int vp8dx_start_decode(BOOL_DECODER *br, const unsigned char *source,
void vp8dx_bool_decoder_fill(BOOL_DECODER *br);
-static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) {
+static VPX_NO_UNSIGNED_SHIFT_CHECK int vp8dx_decode_bool(BOOL_DECODER *br,
+ int probability) {
unsigned int bit = 0;
VP8_BD_VALUE value;
unsigned int split;
diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c
index 51817a2cb..3f459d623 100644
--- a/libvpx/vp8/decoder/decodemv.c
+++ b/libvpx/vp8/decoder/decodemv.c
@@ -372,9 +372,9 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi,
tmp = cnt[CNT_NEAREST];
cnt[CNT_NEAREST] = cnt[CNT_NEAR];
cnt[CNT_NEAR] = tmp;
- tmp = near_mvs[CNT_NEAREST].as_int;
+ tmp = (int)near_mvs[CNT_NEAREST].as_int;
near_mvs[CNT_NEAREST].as_int = near_mvs[CNT_NEAR].as_int;
- near_mvs[CNT_NEAR].as_int = tmp;
+ near_mvs[CNT_NEAR].as_int = (uint32_t)tmp;
}
if (vp8_read(bc, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
diff --git a/libvpx/vp8/decoder/onyxd_int.h b/libvpx/vp8/decoder/onyxd_int.h
index cf2c066d9..a6bedc4fa 100644
--- a/libvpx/vp8/decoder/onyxd_int.h
+++ b/libvpx/vp8/decoder/onyxd_int.h
@@ -11,6 +11,8 @@
#ifndef VPX_VP8_DECODER_ONYXD_INT_H_
#define VPX_VP8_DECODER_ONYXD_INT_H_
+#include <assert.h>
+
#include "vpx_config.h"
#include "vp8/common/onyxd.h"
#include "treereader.h"
@@ -136,6 +138,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb);
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(lval, expr) \
do { \
+ assert(pbi->common.error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
@@ -145,6 +148,7 @@ int vp8_remove_decoder_instances(struct frame_buffers *fb);
#else
#define CHECK_MEM_ERROR(lval, expr) \
do { \
+ assert(pbi->common.error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&pbi->common.error, VPX_CODEC_MEM_ERROR, \
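Note: the added assert() documents the contract that CHECK_MEM_ERROR may only run after the decoder's error handler is armed, because vpx_internal_error() unwinds with longjmp(). A simplified, self-contained sketch of that pattern; error_info stands in for vpx_internal_error_info:

  #include <csetjmp>
  #include <cstdlib>

  struct error_info {
    std::jmp_buf jmp;
    int setjmp_armed;
  };

  static void internal_error(error_info *e) {
    if (e->setjmp_armed) std::longjmp(e->jmp, 1);  // unwind to decode entry
    std::abort();  // no handler armed: nothing to unwind to
  }

  int main() {
    error_info err = {};
    if (setjmp(err.jmp) != 0) return 1;  // landing pad for internal_error()
    err.setjmp_armed = 1;  // from here on, allocation checks are safe
    void *p = std::malloc(16);
    if (p == nullptr) internal_error(&err);
    std::free(p);
    return 0;
  }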
diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c
index 0e97af5f2..190b013af 100644
--- a/libvpx/vp8/encoder/bitstream.c
+++ b/libvpx/vp8/encoder/bitstream.c
@@ -19,6 +19,7 @@
#include <limits.h>
#include "vpx/vpx_encoder.h"
#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/compiler_attributes.h"
#include "vpx_ports/system_state.h"
#include "bitstream.h"
@@ -117,7 +118,9 @@ static void write_split(vp8_writer *bc, int x) {
vp8_mbsplit_encodings + x);
}
-void vp8_pack_tokens(vp8_writer *w, const TOKENEXTRA *p, int xcount) {
+void VPX_NO_UNSIGNED_SHIFT_CHECK vp8_pack_tokens(vp8_writer *w,
+ const TOKENEXTRA *p,
+ int xcount) {
const TOKENEXTRA *stop = p + xcount;
unsigned int split;
int shift;
diff --git a/libvpx/vp8/encoder/encodemv.c b/libvpx/vp8/encoder/encodemv.c
index c88ea1653..384bb2938 100644
--- a/libvpx/vp8/encoder/encodemv.c
+++ b/libvpx/vp8/encoder/encodemv.c
@@ -31,17 +31,15 @@ static void encode_mvcomponent(vp8_writer *const w, const int v,
vp8_write(w, 1, p[mvpis_short]);
- do
+ do {
vp8_write(w, (x >> i) & 1, p[MVPbits + i]);
-
- while (++i < 3);
+ } while (++i < 3);
i = mvlong_width - 1; /* Skip bit 3, which is sometimes implicit */
- do
+ do {
vp8_write(w, (x >> i) & 1, p[MVPbits + i]);
-
- while (--i > 3);
+ } while (--i > 3);
if (x & 0xFFF0) vp8_write(w, (x >> 3) & 1, p[MVPbits + 3]);
}
diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c
index ed177e3cb..65d2681c9 100644
--- a/libvpx/vp8/encoder/firstpass.c
+++ b/libvpx/vp8/encoder/firstpass.c
@@ -903,9 +903,9 @@ static double calc_correction_factor(double err_per_mb, double err_devisor,
correction_factor = pow(error_term, power_term);
/* Clip range */
- correction_factor = (correction_factor < 0.05)
- ? 0.05
- : (correction_factor > 5.0) ? 5.0 : correction_factor;
+ correction_factor = (correction_factor < 0.05) ? 0.05
+ : (correction_factor > 5.0) ? 5.0
+ : correction_factor;
return correction_factor;
}
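Note: the reflowed nested ternary above is clang-format's layout for a clamp; behavior is identical. For reference, the C++17 equivalent (illustrative only — the C codebase keeps the ternary form):

  #include <algorithm>

  double clamp_correction(double correction_factor) {
    return std::clamp(correction_factor, 0.05, 5.0);  // same as the ternary
  }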
@@ -947,11 +947,10 @@ static int estimate_max_q(VP8_COMP *cpi, FIRSTPASS_STATS *fpstats,
}
cpi->twopass.est_max_qcorrection_factor =
- (cpi->twopass.est_max_qcorrection_factor < 0.1)
- ? 0.1
- : (cpi->twopass.est_max_qcorrection_factor > 10.0)
- ? 10.0
- : cpi->twopass.est_max_qcorrection_factor;
+ (cpi->twopass.est_max_qcorrection_factor < 0.1) ? 0.1
+ : (cpi->twopass.est_max_qcorrection_factor > 10.0)
+ ? 10.0
+ : cpi->twopass.est_max_qcorrection_factor;
}
/* Corrections for higher compression speed settings
@@ -1178,10 +1177,9 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err,
} else {
current_spend_ratio = (double)cpi->long_rolling_actual_bits /
(double)cpi->long_rolling_target_bits;
- current_spend_ratio =
- (current_spend_ratio > 10.0)
- ? 10.0
- : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
+ current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0
+ : (current_spend_ratio < 0.1) ? 0.1
+ : current_spend_ratio;
}
/* Calculate a correction factor based on the quality of prediction in
@@ -1968,11 +1966,10 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
cpi->twopass.gf_group_bits =
- (cpi->twopass.gf_group_bits < 0)
- ? 0
- : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
- ? cpi->twopass.kf_group_bits
- : cpi->twopass.gf_group_bits;
+ (cpi->twopass.gf_group_bits < 0) ? 0
+ : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits)
+ ? cpi->twopass.kf_group_bits
+ : cpi->twopass.gf_group_bits;
/* Clip cpi->twopass.gf_group_bits based on user supplied data rate
* variability limit (cpi->oxcf.two_pass_vbrmax_section)
diff --git a/libvpx/vp8/encoder/loongarch/quantize_lsx.c b/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c
index 75889192a..75889192a 100644
--- a/libvpx/vp8/encoder/loongarch/quantize_lsx.c
+++ b/libvpx/vp8/encoder/loongarch/vp8_quantize_lsx.c
diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c
index ae092c66e..b92e2135e 100644
--- a/libvpx/vp8/encoder/mcomp.c
+++ b/libvpx/vp8/encoder/mcomp.c
@@ -204,20 +204,21 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) {
/* returns distortion + motion vector cost */
#define ERR(r, c) (MVC(r, c) + DIST(r, c))
/* checks if (r,c) has better score than previous best */
-#define CHECK_BETTER(v, r, c) \
- do { \
- IFMVCV(r, c, \
- { \
- thismse = DIST(r, c); \
- if ((v = (MVC(r, c) + thismse)) < besterr) { \
- besterr = v; \
- br = r; \
- bc = c; \
- *distortion = thismse; \
- *sse1 = sse; \
- } \
- }, \
- v = UINT_MAX;) \
+#define CHECK_BETTER(v, r, c) \
+ do { \
+ IFMVCV( \
+ r, c, \
+ { \
+ thismse = DIST(r, c); \
+ if ((v = (MVC(r, c) + thismse)) < besterr) { \
+ besterr = v; \
+ br = r; \
+ bc = c; \
+ *distortion = thismse; \
+ *sse1 = sse; \
+ } \
+ }, \
+ v = UINT_MAX;) \
} while (0)
int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c
index ffb3867dd..4bbeadef0 100644
--- a/libvpx/vp8/encoder/onyx_if.c
+++ b/libvpx/vp8/encoder/onyx_if.c
@@ -328,8 +328,8 @@ void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
// for any "new" layers. For "existing" layers, let them inherit the parameters
// from the previous layer state (at the same layer #). In future we may want
// to better map the previous layer state(s) to the "new" ones.
-static void reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
- const int prev_num_layers) {
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+ const int prev_num_layers) {
int i;
double prev_layer_framerate = 0;
const int curr_num_layers = cpi->oxcf.number_of_layers;
@@ -1643,7 +1643,7 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) {
cpi->temporal_layer_id = 0;
}
cpi->temporal_pattern_counter = 0;
- reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
+ vp8_reset_temporal_layer_change(cpi, oxcf, prev_number_of_layers);
}
if (!cpi->initial_width) {
@@ -4202,11 +4202,10 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size,
}
/* Clamp cpi->zbin_over_quant */
- cpi->mb.zbin_over_quant = (cpi->mb.zbin_over_quant < zbin_oq_low)
- ? zbin_oq_low
- : (cpi->mb.zbin_over_quant > zbin_oq_high)
- ? zbin_oq_high
- : cpi->mb.zbin_over_quant;
+ cpi->mb.zbin_over_quant =
+ (cpi->mb.zbin_over_quant < zbin_oq_low) ? zbin_oq_low
+ : (cpi->mb.zbin_over_quant > zbin_oq_high) ? zbin_oq_high
+ : cpi->mb.zbin_over_quant;
Loop = Q != last_q;
} else {
diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h
index 424f51b18..46a17913a 100644
--- a/libvpx/vp8/encoder/onyx_int.h
+++ b/libvpx/vp8/encoder/onyx_int.h
@@ -11,7 +11,9 @@
#ifndef VPX_VP8_ENCODER_ONYX_INT_H_
#define VPX_VP8_ENCODER_ONYX_INT_H_
+#include <assert.h>
#include <stdio.h>
+
#include "vpx_config.h"
#include "vp8/common/onyx.h"
#include "treewriter.h"
@@ -483,7 +485,7 @@ typedef struct VP8_COMP {
unsigned char *segmentation_map;
signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
- int segment_encode_breakout[MAX_MB_SEGMENTS];
+ unsigned int segment_encode_breakout[MAX_MB_SEGMENTS];
unsigned char *active_map;
unsigned int active_map_enabled;
@@ -711,6 +713,8 @@ void vp8_initialize_enc(void);
void vp8_alloc_compressor_data(VP8_COMP *cpi);
int vp8_reverse_trans(int x);
+void vp8_reset_temporal_layer_change(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+ const int prev_num_layers);
void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
const int layer,
double prev_layer_framerate);
@@ -730,6 +734,7 @@ void vp8_set_speed_features(VP8_COMP *cpi);
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(lval, expr) \
do { \
+ assert(cpi->common.error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
@@ -739,6 +744,7 @@ void vp8_set_speed_features(VP8_COMP *cpi);
#else
#define CHECK_MEM_ERROR(lval, expr) \
do { \
+ assert(cpi->common.error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, \
diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c
index 5821fc734..bbddacf8f 100644
--- a/libvpx/vp8/encoder/rdopt.c
+++ b/libvpx/vp8/encoder/rdopt.c
@@ -1608,7 +1608,7 @@ static int evaluate_inter_mode_rd(int mdcounts[4], RATE_DISTORTION *rd,
unsigned int q2dc = xd->block[24].dequant[0];
/* If there is no codeable 2nd order dc
or a very small uniform pixel change */
- if ((sse - var<q2dc * q2dc>> 4) || (sse / 2 > var && sse - var < 64)) {
+ if ((sse - var < q2dc * q2dc >> 4) || (sse / 2 > var && sse - var < 64)) {
/* Check u and v to make sure skip is ok */
unsigned int sse2 = VP8_UVSSE(x);
if (sse2 * 2 < threshold) {
diff --git a/libvpx/vp8/encoder/x86/denoising_sse2.c b/libvpx/vp8/encoder/x86/denoising_sse2.c
index 89cad5335..f35b93016 100644
--- a/libvpx/vp8/encoder/x86/denoising_sse2.c
+++ b/libvpx/vp8/encoder/x86/denoising_sse2.c
@@ -30,7 +30,7 @@ static INLINE unsigned int abs_sum_diff_16x1(__m128i acc_diff) {
_mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
const __m128i hgfedcba =
_mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
- unsigned int sum_diff = abs(_mm_cvtsi128_si32(hgfedcba));
+ unsigned int sum_diff = (unsigned int)abs(_mm_cvtsi128_si32(hgfedcba));
return sum_diff;
}
diff --git a/libvpx/vp8/encoder/x86/quantize_sse4.c b/libvpx/vp8/encoder/x86/quantize_sse4.c
index 6d03365fc..4c2d24cc2 100644
--- a/libvpx/vp8/encoder/x86/quantize_sse4.c
+++ b/libvpx/vp8/encoder/x86/quantize_sse4.c
@@ -13,8 +13,11 @@
#include "./vp8_rtcd.h"
#include "vp8/encoder/block.h"
#include "vpx_ports/bitops.h" /* get_lsb */
+#include "vpx_ports/compiler_attributes.h"
-void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
+// Unsigned shift overflow is disabled for the use of ~1U << eob with ymask.
+VPX_NO_UNSIGNED_SHIFT_CHECK void vp8_regular_quantize_b_sse4_1(BLOCK *b,
+ BLOCKD *d) {
int eob = -1;
short *zbin_boost_ptr = b->zrun_zbin_boost;
__m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr));
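Note: VPX_NO_UNSIGNED_SHIFT_CHECK suppresses clang's optional unsigned-shift-base sanitizer on functions that deliberately shift set bits out of an unsigned value, as this quantizer does with ~1U << eob. The operation is well-defined C; the attribute only silences the diagnostic. A small demonstration of the flagged pattern:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const int eob = 12;
    // High bits of ~1U are shifted out; -fsanitize=unsigned-shift-base
    // reports this even though the result is exactly what is wanted.
    const uint32_t ymask = ~1U << eob;
    std::printf("0x%08x\n", ymask);  // prints 0xffffe000
  }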
diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c
index 6d88e5154..55a77ba7e 100644
--- a/libvpx/vp8/vp8_dx_iface.c
+++ b/libvpx/vp8/vp8_dx_iface.c
@@ -275,7 +275,7 @@ static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t *ctx,
void *user_priv, long deadline) {
volatile vpx_codec_err_t res;
volatile unsigned int resolution_change = 0;
- unsigned int w, h;
+ volatile unsigned int w, h;
if (!ctx->fragments.enabled && (data == NULL && data_sz == 0)) {
return 0;
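Note: w and h become volatile because vp8_decode() uses setjmp() for error handling; locals modified between setjmp() and a later longjmp() otherwise have indeterminate values after the jump (C11 7.13.2.1). A minimal illustration:

  #include <csetjmp>
  #include <cstdio>

  static std::jmp_buf env;

  int main() {
    volatile unsigned int w = 0;  // without volatile, w would be
                                  // indeterminate after the longjmp
    if (setjmp(env) == 0) {
      w = 640;               // modified after setjmp() ...
      std::longjmp(env, 1);  // ... then the error path unwinds
    }
    std::printf("%u\n", w);  // reliably prints 640
    return 0;
  }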
diff --git a/libvpx/vp8/vp8_ratectrl_rtc.cc b/libvpx/vp8/vp8_ratectrl_rtc.cc
index 2f23c5b1d..f3f42529d 100644
--- a/libvpx/vp8/vp8_ratectrl_rtc.cc
+++ b/libvpx/vp8/vp8_ratectrl_rtc.cc
@@ -92,6 +92,7 @@ void VP8RateControlRTC::UpdateRateControl(
const VP8RateControlRtcConfig &rc_cfg) {
VP8_COMMON *cm = &cpi_->common;
VP8_CONFIG *oxcf = &cpi_->oxcf;
+ const unsigned int prev_number_of_layers = oxcf->number_of_layers;
vpx_clear_system_state();
cm->Width = rc_cfg.width;
cm->Height = rc_cfg.height;
@@ -124,17 +125,33 @@ void VP8RateControlRTC::UpdateRateControl(
static_cast<int>(cpi_->output_framerate);
}
- if (oxcf->number_of_layers > 1) {
+ if (oxcf->number_of_layers > 1 || prev_number_of_layers > 1) {
memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate,
sizeof(rc_cfg.layer_target_bitrate));
memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator,
sizeof(rc_cfg.ts_rate_decimator));
- oxcf->periodicity = 2;
+ if (cm->current_video_frame == 0) {
+ double prev_layer_framerate = 0;
+ for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
+ vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
+ prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+ }
+ } else if (oxcf->number_of_layers != prev_number_of_layers) {
+ // The number of temporal layers has changed, so reset/initialize the
+ // temporal layer context for the new layer configuration: this means
+ // calling vp8_reset_temporal_layer_change() below.
+
+ // Start at the base of the pattern cycle, so set the layer id to 0 and
+ // reset the temporal pattern counter.
+ // TODO(marpan/jianj): don't think lines 148-151 are needed (user controls
+ // the layer_id) so remove.
+ if (cpi_->temporal_layer_id > 0) {
+ cpi_->temporal_layer_id = 0;
+ }
+ cpi_->temporal_pattern_counter = 0;
- double prev_layer_framerate = 0;
- for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
- vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
- prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+ vp8_reset_temporal_layer_change(cpi_, oxcf,
+ static_cast<int>(prev_number_of_layers));
}
}
@@ -146,20 +163,24 @@ void VP8RateControlRTC::UpdateRateControl(
cm->MBs = cm->mb_rows * cm->mb_cols;
cm->mode_info_stride = cm->mb_cols + 1;
- oxcf->starting_buffer_level =
- rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
- /* Set or reset optimal and maximum buffer levels. */
- if (oxcf->optimal_buffer_level == 0) {
- oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
- } else {
- oxcf->optimal_buffer_level =
- rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000);
- }
- if (oxcf->maximum_buffer_size == 0) {
- oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
- } else {
- oxcf->maximum_buffer_size =
- rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+ // For temporal layers: starting/maximum/optimal_buffer_level is already set
+ // via vp8_init_temporal_layer_context() or vp8_reset_temporal_layer_change().
+ if (oxcf->number_of_layers <= 1 && prev_number_of_layers <= 1) {
+ oxcf->starting_buffer_level =
+ rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
+ /* Set or reset optimal and maximum buffer levels. */
+ if (oxcf->optimal_buffer_level == 0) {
+ oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
+ } else {
+ oxcf->optimal_buffer_level = rescale((int)oxcf->optimal_buffer_level,
+ oxcf->target_bandwidth, 1000);
+ }
+ if (oxcf->maximum_buffer_size == 0) {
+ oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
+ } else {
+ oxcf->maximum_buffer_size =
+ rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+ }
}
if (cpi_->bits_off_target > oxcf->maximum_buffer_size) {
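Note: the buffer levels in oxcf arrive in milliseconds and rescale() converts them to bits at the target bandwidth, so the new guard avoids converting twice when the temporal-layer contexts have already done so. A sketch of the conversion, assuming the static rescale() helper in onyx_if.c:

  #include <cstdint>

  static int rescale(int val_ms, int bandwidth_bps, int denom) {
    return static_cast<int>(static_cast<int64_t>(val_ms) * bandwidth_bps /
                            denom);
  }
  // e.g. a 4000 ms starting_buffer_level at 800000 bps:
  //   rescale(4000, 800000, 1000) == 3200000 bits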
diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk
index 5744cbabc..b4b3fda9e 100644
--- a/libvpx/vp8/vp8cx.mk
+++ b/libvpx/vp8/vp8cx.mk
@@ -125,8 +125,8 @@ VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
endif
# common (loongarch LSX intrinsics)
-VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c
VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c
VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/vp8_quantize_lsx.c
VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h
index 3cec53bfd..8d2bed38e 100644
--- a/libvpx/vp9/common/vp9_common.h
+++ b/libvpx/vp9/common/vp9_common.h
@@ -49,6 +49,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
#if CONFIG_DEBUG
#define CHECK_MEM_ERROR(cm, lval, expr) \
do { \
+ assert(&(cm)->error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
@@ -58,6 +59,7 @@ static INLINE int get_unsigned_bits(unsigned int num_values) {
#else
#define CHECK_MEM_ERROR(cm, lval, expr) \
do { \
+ assert(&(cm)->error.setjmp); \
(lval) = (expr); \
if (!(lval)) \
vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR, \
diff --git a/libvpx/vp9/common/vp9_loopfilter.c b/libvpx/vp9/common/vp9_loopfilter.c
index 95d6029f3..765cb1172 100644
--- a/libvpx/vp9/common/vp9_loopfilter.c
+++ b/libvpx/vp9/common/vp9_loopfilter.c
@@ -1180,7 +1180,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
}
// Disable filtering on the leftmost column
- border_mask = ~(mi_col == 0 ? 1 : 0);
+ border_mask = ~(mi_col == 0 ? 1u : 0u);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
highbd_filter_selectively_vert(
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl
index 4da0b6675..f4bd9772c 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -129,10 +129,10 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp neon sse2 ssse3 avx2 vsx/;
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp_32x32 neon ssse3 avx2 vsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vp9_block_error avx2 sse2/;
@@ -175,7 +175,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
# Motion search
#
add_proto qw/int vp9_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
-specialize qw/vp9_diamond_search_sad avx/;
+specialize qw/vp9_diamond_search_sad avx neon/;
#
# Apply temporal filter
@@ -196,11 +196,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# ENCODEMB INVOKE
add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_highbd_quantize_fp avx2 neon/;
add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/vp9_highbd_quantize_fp_32x32 avx2 neon/;
# fdct functions
add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+ specialize qw/vp9_highbd_fht4x4 neon/;
add_proto qw/void vp9_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
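
These add_proto/specialize lines feed rtcd.pl, which generates the per-target dispatch headers touched elsewhere in this change (config/*/vp9_rtcd.h). Roughly, each specialize entry becomes a candidate that setup code assigns to a function pointer based on runtime CPU flags; a hedged sketch of the generated shape, with a simplified signature and stub bodies (the real names and flags vary per platform):

    #include <stdint.h>

    typedef void (*quantize_fp_fn)(const int16_t *coeff, intptr_t n_coeffs);

    /* Stubs standing in for the real C and Neon implementations. */
    static void vp9_quantize_fp_c(const int16_t *c, intptr_t n) { (void)c; (void)n; }
    static void vp9_quantize_fp_neon(const int16_t *c, intptr_t n) { (void)c; (void)n; }

    #define HAS_NEON 0x1

    static quantize_fp_fn vp9_quantize_fp;

    static void setup_rtcd_internal(int cpu_flags) {
      vp9_quantize_fp = vp9_quantize_fp_c; /* portable fallback */
      if (cpu_flags & HAS_NEON)            /* from `specialize qw/... neon/` */
        vp9_quantize_fp = vp9_quantize_fp_neon;
    }
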
diff --git a/libvpx/vp9/common/vp9_scan.c b/libvpx/vp9/common/vp9_scan.c
index 0fef26351..8bea61dea 100644
--- a/libvpx/vp9/common/vp9_scan.c
+++ b/libvpx/vp9/common/vp9_scan.c
@@ -511,180 +511,181 @@ DECLARE_ALIGNED(16, static const int16_t,
959, 990, 991, 1022, 0, 0,
};
+// Add 1 to each iscan value so it represents the EOB position instead of the coefficient index.
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_4x4[16]) = {
- 0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
+ 1, 3, 6, 9, 2, 4, 10, 13, 5, 8, 12, 15, 7, 11, 14, 16,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_4x4[16]) = {
- 0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
+ 1, 4, 8, 12, 2, 6, 10, 13, 3, 7, 11, 15, 5, 9, 14, 16,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_4x4[16]) = {
- 0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
+ 1, 2, 4, 6, 3, 5, 7, 10, 8, 9, 12, 14, 11, 13, 15, 16,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_8x8[64]) = {
- 0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
- 2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
- 6, 12, 21, 27, 35, 43, 52, 58, 9, 17, 25, 33, 39, 48, 55, 60,
- 14, 23, 30, 37, 45, 53, 59, 62, 19, 29, 36, 42, 49, 57, 61, 63,
+ 1, 4, 9, 16, 23, 33, 41, 48, 2, 6, 12, 19, 27, 35, 45, 52,
+ 3, 8, 14, 21, 29, 39, 47, 55, 5, 11, 17, 25, 32, 42, 51, 57,
+ 7, 13, 22, 28, 36, 44, 53, 59, 10, 18, 26, 34, 40, 49, 56, 61,
+ 15, 24, 31, 38, 46, 54, 60, 63, 20, 30, 37, 43, 50, 58, 62, 64,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_8x8[64]) = {
- 0, 1, 2, 5, 8, 12, 19, 24, 3, 4, 7, 10, 15, 20, 30, 39,
- 6, 9, 13, 16, 21, 27, 37, 46, 11, 14, 17, 23, 28, 34, 44, 52,
- 18, 22, 25, 31, 35, 41, 50, 57, 26, 29, 33, 38, 43, 49, 55, 59,
- 32, 36, 42, 47, 51, 54, 60, 61, 40, 45, 48, 53, 56, 58, 62, 63,
+ 1, 2, 3, 6, 9, 13, 20, 25, 4, 5, 8, 11, 16, 21, 31, 40,
+ 7, 10, 14, 17, 22, 28, 38, 47, 12, 15, 18, 24, 29, 35, 45, 53,
+ 19, 23, 26, 32, 36, 42, 51, 58, 27, 30, 34, 39, 44, 50, 56, 60,
+ 33, 37, 43, 48, 52, 55, 61, 62, 41, 46, 49, 54, 57, 59, 63, 64,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_8x8[64]) = {
- 0, 2, 5, 9, 14, 22, 31, 37, 1, 4, 8, 13, 19, 26, 38, 44,
- 3, 6, 10, 17, 24, 30, 42, 49, 7, 11, 15, 21, 29, 36, 47, 53,
- 12, 16, 20, 27, 34, 43, 52, 57, 18, 23, 28, 35, 41, 48, 56, 60,
- 25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
+ 1, 3, 6, 10, 15, 23, 32, 38, 2, 5, 9, 14, 20, 27, 39, 45,
+ 4, 7, 11, 18, 25, 31, 43, 50, 8, 12, 16, 22, 30, 37, 48, 54,
+ 13, 17, 21, 28, 35, 44, 53, 58, 19, 24, 29, 36, 42, 49, 57, 61,
+ 26, 33, 40, 46, 51, 56, 60, 63, 34, 41, 47, 52, 55, 59, 62, 64,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_col_iscan_16x16[256]) = {
- 0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
- 1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
- 2, 8, 16, 25, 38, 52, 67, 83, 101, 116, 136, 157, 172, 190, 205, 216,
- 3, 10, 18, 29, 41, 55, 71, 89, 103, 119, 141, 159, 176, 194, 208, 218,
- 5, 12, 21, 32, 45, 58, 74, 93, 104, 123, 144, 164, 179, 196, 210, 223,
- 7, 15, 26, 37, 49, 63, 78, 96, 112, 129, 146, 166, 182, 200, 215, 228,
- 9, 19, 28, 39, 54, 69, 86, 102, 117, 132, 151, 170, 187, 206, 220, 230,
- 13, 24, 35, 46, 60, 73, 91, 108, 122, 137, 154, 174, 189, 207, 224, 235,
- 17, 30, 40, 53, 66, 82, 98, 115, 126, 142, 161, 180, 197, 213, 227, 237,
- 22, 36, 48, 62, 76, 92, 105, 120, 133, 147, 167, 186, 203, 219, 232, 240,
- 27, 44, 56, 70, 84, 99, 113, 127, 140, 156, 175, 193, 209, 226, 236, 244,
- 33, 51, 68, 79, 94, 110, 125, 138, 149, 162, 184, 202, 217, 229, 241, 247,
- 42, 61, 77, 90, 106, 121, 134, 148, 160, 173, 191, 211, 225, 238, 245, 251,
- 50, 72, 87, 100, 118, 128, 145, 158, 168, 183, 204, 222, 233, 242, 249, 253,
- 57, 80, 97, 111, 131, 143, 155, 169, 178, 192, 214, 231, 239, 246, 250, 254,
- 65, 88, 107, 124, 139, 152, 163, 177, 185, 199, 221, 234, 243, 248, 252, 255,
+ 1, 5, 12, 21, 32, 44, 60, 76, 86, 110, 131, 151, 166, 182, 196, 199,
+ 2, 7, 15, 24, 35, 48, 65, 82, 96, 115, 136, 154, 172, 189, 202, 213,
+ 3, 9, 17, 26, 39, 53, 68, 84, 102, 117, 137, 158, 173, 191, 206, 217,
+ 4, 11, 19, 30, 42, 56, 72, 90, 104, 120, 142, 160, 177, 195, 209, 219,
+ 6, 13, 22, 33, 46, 59, 75, 94, 105, 124, 145, 165, 180, 197, 211, 224,
+ 8, 16, 27, 38, 50, 64, 79, 97, 113, 130, 147, 167, 183, 201, 216, 229,
+ 10, 20, 29, 40, 55, 70, 87, 103, 118, 133, 152, 171, 188, 207, 221, 231,
+ 14, 25, 36, 47, 61, 74, 92, 109, 123, 138, 155, 175, 190, 208, 225, 236,
+ 18, 31, 41, 54, 67, 83, 99, 116, 127, 143, 162, 181, 198, 214, 228, 238,
+ 23, 37, 49, 63, 77, 93, 106, 121, 134, 148, 168, 187, 204, 220, 233, 241,
+ 28, 45, 57, 71, 85, 100, 114, 128, 141, 157, 176, 194, 210, 227, 237, 245,
+ 34, 52, 69, 80, 95, 111, 126, 139, 150, 163, 185, 203, 218, 230, 242, 248,
+ 43, 62, 78, 91, 107, 122, 135, 149, 161, 174, 192, 212, 226, 239, 246, 252,
+ 51, 73, 88, 101, 119, 129, 146, 159, 169, 184, 205, 223, 234, 243, 250, 254,
+ 58, 81, 98, 112, 132, 144, 156, 170, 179, 193, 215, 232, 240, 247, 251, 255,
+ 66, 89, 108, 125, 140, 153, 164, 178, 186, 200, 222, 235, 244, 249, 253, 256,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_row_iscan_16x16[256]) = {
- 0, 1, 2, 4, 6, 9, 12, 17, 22, 29, 36, 43, 54, 64, 76,
- 86, 3, 5, 7, 11, 15, 19, 25, 32, 38, 48, 59, 68, 84, 99,
- 115, 130, 8, 10, 13, 18, 23, 27, 33, 42, 51, 60, 72, 88, 103,
- 119, 142, 167, 14, 16, 20, 26, 31, 37, 44, 53, 61, 73, 85, 100,
- 116, 135, 161, 185, 21, 24, 30, 35, 40, 47, 55, 65, 74, 81, 94,
- 112, 133, 154, 179, 205, 28, 34, 39, 45, 50, 58, 67, 77, 87, 96,
- 106, 121, 146, 169, 196, 212, 41, 46, 49, 56, 63, 70, 79, 90, 98,
- 107, 122, 138, 159, 182, 207, 222, 52, 57, 62, 69, 75, 83, 93, 102,
- 110, 120, 134, 150, 176, 195, 215, 226, 66, 71, 78, 82, 91, 97, 108,
- 113, 127, 136, 148, 168, 188, 202, 221, 232, 80, 89, 92, 101, 105, 114,
- 125, 131, 139, 151, 162, 177, 192, 208, 223, 234, 95, 104, 109, 117, 123,
- 128, 143, 144, 155, 165, 175, 190, 206, 219, 233, 239, 111, 118, 124, 129,
- 140, 147, 157, 164, 170, 181, 191, 203, 224, 230, 240, 243, 126, 132, 137,
- 145, 153, 160, 174, 178, 184, 197, 204, 216, 231, 237, 244, 246, 141, 149,
- 156, 166, 172, 180, 189, 199, 200, 210, 220, 228, 238, 242, 249, 251, 152,
- 163, 171, 183, 186, 193, 201, 211, 214, 218, 227, 236, 245, 247, 252, 253,
- 158, 173, 187, 194, 198, 209, 213, 217, 225, 229, 235, 241, 248, 250, 254,
- 255,
+ 1, 2, 3, 5, 7, 10, 13, 18, 23, 30, 37, 44, 55, 65, 77,
+ 87, 4, 6, 8, 12, 16, 20, 26, 33, 39, 49, 60, 69, 85, 100,
+ 116, 131, 9, 11, 14, 19, 24, 28, 34, 43, 52, 61, 73, 89, 104,
+ 120, 143, 168, 15, 17, 21, 27, 32, 38, 45, 54, 62, 74, 86, 101,
+ 117, 136, 162, 186, 22, 25, 31, 36, 41, 48, 56, 66, 75, 82, 95,
+ 113, 134, 155, 180, 206, 29, 35, 40, 46, 51, 59, 68, 78, 88, 97,
+ 107, 122, 147, 170, 197, 213, 42, 47, 50, 57, 64, 71, 80, 91, 99,
+ 108, 123, 139, 160, 183, 208, 223, 53, 58, 63, 70, 76, 84, 94, 103,
+ 111, 121, 135, 151, 177, 196, 216, 227, 67, 72, 79, 83, 92, 98, 109,
+ 114, 128, 137, 149, 169, 189, 203, 222, 233, 81, 90, 93, 102, 106, 115,
+ 126, 132, 140, 152, 163, 178, 193, 209, 224, 235, 96, 105, 110, 118, 124,
+ 129, 144, 145, 156, 166, 176, 191, 207, 220, 234, 240, 112, 119, 125, 130,
+ 141, 148, 158, 165, 171, 182, 192, 204, 225, 231, 241, 244, 127, 133, 138,
+ 146, 154, 161, 175, 179, 185, 198, 205, 217, 232, 238, 245, 247, 142, 150,
+ 157, 167, 173, 181, 190, 200, 201, 211, 221, 229, 239, 243, 250, 252, 153,
+ 164, 172, 184, 187, 194, 202, 212, 215, 219, 228, 237, 246, 248, 253, 254,
+ 159, 174, 188, 195, 199, 210, 214, 218, 226, 230, 236, 242, 249, 251, 255,
+ 256,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_16x16[256]) = {
- 0, 2, 5, 9, 17, 24, 36, 44, 55, 72, 88, 104, 128, 143, 166,
- 179, 1, 4, 8, 13, 20, 30, 40, 54, 66, 79, 96, 113, 141, 154,
- 178, 196, 3, 7, 11, 18, 25, 33, 46, 57, 71, 86, 101, 119, 148,
- 164, 186, 201, 6, 12, 16, 23, 31, 39, 53, 64, 78, 92, 110, 127,
- 153, 169, 193, 208, 10, 14, 19, 28, 37, 47, 58, 67, 84, 98, 114,
- 133, 161, 176, 198, 214, 15, 21, 26, 34, 43, 52, 65, 77, 91, 106,
- 120, 140, 165, 185, 205, 221, 22, 27, 32, 41, 48, 60, 73, 85, 99,
- 116, 130, 151, 175, 190, 211, 225, 29, 35, 42, 49, 59, 69, 81, 95,
- 108, 125, 139, 155, 182, 197, 217, 229, 38, 45, 51, 61, 68, 80, 93,
- 105, 118, 134, 150, 168, 191, 207, 223, 234, 50, 56, 63, 74, 83, 94,
- 109, 117, 129, 147, 163, 177, 199, 213, 228, 238, 62, 70, 76, 87, 97,
- 107, 122, 131, 145, 159, 172, 188, 210, 222, 235, 242, 75, 82, 90, 102,
- 112, 124, 138, 146, 157, 173, 187, 202, 219, 230, 240, 245, 89, 100, 111,
- 123, 132, 142, 156, 167, 180, 189, 203, 216, 231, 237, 246, 250, 103, 115,
- 126, 136, 149, 162, 171, 183, 194, 204, 215, 224, 236, 241, 248, 252, 121,
- 135, 144, 158, 170, 181, 192, 200, 209, 218, 227, 233, 243, 244, 251, 254,
- 137, 152, 160, 174, 184, 195, 206, 212, 220, 226, 232, 239, 247, 249, 253,
- 255,
+ 1, 3, 6, 10, 18, 25, 37, 45, 56, 73, 89, 105, 129, 144, 167,
+ 180, 2, 5, 9, 14, 21, 31, 41, 55, 67, 80, 97, 114, 142, 155,
+ 179, 197, 4, 8, 12, 19, 26, 34, 47, 58, 72, 87, 102, 120, 149,
+ 165, 187, 202, 7, 13, 17, 24, 32, 40, 54, 65, 79, 93, 111, 128,
+ 154, 170, 194, 209, 11, 15, 20, 29, 38, 48, 59, 68, 85, 99, 115,
+ 134, 162, 177, 199, 215, 16, 22, 27, 35, 44, 53, 66, 78, 92, 107,
+ 121, 141, 166, 186, 206, 222, 23, 28, 33, 42, 49, 61, 74, 86, 100,
+ 117, 131, 152, 176, 191, 212, 226, 30, 36, 43, 50, 60, 70, 82, 96,
+ 109, 126, 140, 156, 183, 198, 218, 230, 39, 46, 52, 62, 69, 81, 94,
+ 106, 119, 135, 151, 169, 192, 208, 224, 235, 51, 57, 64, 75, 84, 95,
+ 110, 118, 130, 148, 164, 178, 200, 214, 229, 239, 63, 71, 77, 88, 98,
+ 108, 123, 132, 146, 160, 173, 189, 211, 223, 236, 243, 76, 83, 91, 103,
+ 113, 125, 139, 147, 158, 174, 188, 203, 220, 231, 241, 246, 90, 101, 112,
+ 124, 133, 143, 157, 168, 181, 190, 204, 217, 232, 238, 247, 251, 104, 116,
+ 127, 137, 150, 163, 172, 184, 195, 205, 216, 225, 237, 242, 249, 253, 122,
+ 136, 145, 159, 171, 182, 193, 201, 210, 219, 228, 234, 244, 245, 252, 255,
+ 138, 153, 161, 175, 185, 196, 207, 213, 221, 227, 233, 240, 248, 250, 254,
+ 256,
};
DECLARE_ALIGNED(16, static const int16_t, vp9_default_iscan_32x32[1024]) = {
- 0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145,
- 170, 193, 204, 210, 219, 229, 233, 245, 257, 275, 299, 342, 356,
- 377, 405, 455, 471, 495, 527, 1, 4, 8, 15, 22, 30, 45,
- 58, 74, 92, 112, 133, 158, 184, 203, 215, 222, 228, 234, 237,
- 256, 274, 298, 317, 355, 376, 404, 426, 470, 494, 526, 551, 3,
- 7, 12, 18, 28, 36, 52, 64, 82, 102, 118, 142, 164, 189,
- 208, 217, 224, 231, 235, 238, 273, 297, 316, 329, 375, 403, 425,
- 440, 493, 525, 550, 567, 6, 11, 16, 23, 31, 43, 60, 73,
- 90, 109, 126, 150, 173, 196, 211, 220, 226, 232, 236, 239, 296,
- 315, 328, 335, 402, 424, 439, 447, 524, 549, 566, 575, 9, 14,
- 19, 29, 37, 50, 65, 78, 95, 116, 134, 157, 179, 201, 214,
- 223, 244, 255, 272, 295, 341, 354, 374, 401, 454, 469, 492, 523,
- 582, 596, 617, 645, 13, 20, 26, 35, 44, 54, 72, 85, 105,
- 123, 140, 163, 182, 205, 216, 225, 254, 271, 294, 314, 353, 373,
- 400, 423, 468, 491, 522, 548, 595, 616, 644, 666, 21, 27, 33,
- 42, 53, 63, 80, 94, 113, 132, 151, 172, 190, 209, 218, 227,
- 270, 293, 313, 327, 372, 399, 422, 438, 490, 521, 547, 565, 615,
- 643, 665, 680, 24, 32, 39, 48, 57, 71, 88, 104, 120, 139,
- 159, 178, 197, 212, 221, 230, 292, 312, 326, 334, 398, 421, 437,
- 446, 520, 546, 564, 574, 642, 664, 679, 687, 34, 40, 46, 56,
- 68, 81, 96, 111, 130, 147, 167, 186, 243, 253, 269, 291, 340,
- 352, 371, 397, 453, 467, 489, 519, 581, 594, 614, 641, 693, 705,
- 723, 747, 41, 49, 55, 67, 77, 91, 107, 124, 138, 161, 177,
- 194, 252, 268, 290, 311, 351, 370, 396, 420, 466, 488, 518, 545,
- 593, 613, 640, 663, 704, 722, 746, 765, 51, 59, 66, 76, 89,
- 99, 119, 131, 149, 168, 181, 200, 267, 289, 310, 325, 369, 395,
- 419, 436, 487, 517, 544, 563, 612, 639, 662, 678, 721, 745, 764,
- 777, 61, 69, 75, 87, 100, 114, 129, 144, 162, 180, 191, 207,
- 288, 309, 324, 333, 394, 418, 435, 445, 516, 543, 562, 573, 638,
- 661, 677, 686, 744, 763, 776, 783, 70, 79, 86, 97, 108, 122,
- 137, 155, 242, 251, 266, 287, 339, 350, 368, 393, 452, 465, 486,
- 515, 580, 592, 611, 637, 692, 703, 720, 743, 788, 798, 813, 833,
- 84, 93, 103, 110, 125, 141, 154, 171, 250, 265, 286, 308, 349,
- 367, 392, 417, 464, 485, 514, 542, 591, 610, 636, 660, 702, 719,
- 742, 762, 797, 812, 832, 848, 98, 106, 115, 127, 143, 156, 169,
- 185, 264, 285, 307, 323, 366, 391, 416, 434, 484, 513, 541, 561,
- 609, 635, 659, 676, 718, 741, 761, 775, 811, 831, 847, 858, 117,
- 128, 136, 148, 160, 175, 188, 198, 284, 306, 322, 332, 390, 415,
- 433, 444, 512, 540, 560, 572, 634, 658, 675, 685, 740, 760, 774,
- 782, 830, 846, 857, 863, 135, 146, 152, 165, 241, 249, 263, 283,
- 338, 348, 365, 389, 451, 463, 483, 511, 579, 590, 608, 633, 691,
- 701, 717, 739, 787, 796, 810, 829, 867, 875, 887, 903, 153, 166,
- 174, 183, 248, 262, 282, 305, 347, 364, 388, 414, 462, 482, 510,
- 539, 589, 607, 632, 657, 700, 716, 738, 759, 795, 809, 828, 845,
- 874, 886, 902, 915, 176, 187, 195, 202, 261, 281, 304, 321, 363,
- 387, 413, 432, 481, 509, 538, 559, 606, 631, 656, 674, 715, 737,
- 758, 773, 808, 827, 844, 856, 885, 901, 914, 923, 192, 199, 206,
- 213, 280, 303, 320, 331, 386, 412, 431, 443, 508, 537, 558, 571,
- 630, 655, 673, 684, 736, 757, 772, 781, 826, 843, 855, 862, 900,
- 913, 922, 927, 240, 247, 260, 279, 337, 346, 362, 385, 450, 461,
- 480, 507, 578, 588, 605, 629, 690, 699, 714, 735, 786, 794, 807,
- 825, 866, 873, 884, 899, 930, 936, 945, 957, 246, 259, 278, 302,
- 345, 361, 384, 411, 460, 479, 506, 536, 587, 604, 628, 654, 698,
- 713, 734, 756, 793, 806, 824, 842, 872, 883, 898, 912, 935, 944,
- 956, 966, 258, 277, 301, 319, 360, 383, 410, 430, 478, 505, 535,
- 557, 603, 627, 653, 672, 712, 733, 755, 771, 805, 823, 841, 854,
- 882, 897, 911, 921, 943, 955, 965, 972, 276, 300, 318, 330, 382,
- 409, 429, 442, 504, 534, 556, 570, 626, 652, 671, 683, 732, 754,
- 770, 780, 822, 840, 853, 861, 896, 910, 920, 926, 954, 964, 971,
- 975, 336, 344, 359, 381, 449, 459, 477, 503, 577, 586, 602, 625,
- 689, 697, 711, 731, 785, 792, 804, 821, 865, 871, 881, 895, 929,
- 934, 942, 953, 977, 981, 987, 995, 343, 358, 380, 408, 458, 476,
- 502, 533, 585, 601, 624, 651, 696, 710, 730, 753, 791, 803, 820,
- 839, 870, 880, 894, 909, 933, 941, 952, 963, 980, 986, 994, 1001,
- 357, 379, 407, 428, 475, 501, 532, 555, 600, 623, 650, 670, 709,
- 729, 752, 769, 802, 819, 838, 852, 879, 893, 908, 919, 940, 951,
- 962, 970, 985, 993, 1000, 1005, 378, 406, 427, 441, 500, 531, 554,
- 569, 622, 649, 669, 682, 728, 751, 768, 779, 818, 837, 851, 860,
- 892, 907, 918, 925, 950, 961, 969, 974, 992, 999, 1004, 1007, 448,
- 457, 474, 499, 576, 584, 599, 621, 688, 695, 708, 727, 784, 790,
- 801, 817, 864, 869, 878, 891, 928, 932, 939, 949, 976, 979, 984,
- 991, 1008, 1010, 1013, 1017, 456, 473, 498, 530, 583, 598, 620, 648,
- 694, 707, 726, 750, 789, 800, 816, 836, 868, 877, 890, 906, 931,
- 938, 948, 960, 978, 983, 990, 998, 1009, 1012, 1016, 1020, 472, 497,
- 529, 553, 597, 619, 647, 668, 706, 725, 749, 767, 799, 815, 835,
- 850, 876, 889, 905, 917, 937, 947, 959, 968, 982, 989, 997, 1003,
- 1011, 1015, 1019, 1022, 496, 528, 552, 568, 618, 646, 667, 681, 724,
- 748, 766, 778, 814, 834, 849, 859, 888, 904, 916, 924, 946, 958,
- 967, 973, 988, 996, 1002, 1006, 1014, 1018, 1021, 1023,
+ 1, 3, 6, 11, 18, 26, 39, 48, 63, 84, 102, 122, 146,
+ 171, 194, 205, 211, 220, 230, 234, 246, 258, 276, 300, 343, 357,
+ 378, 406, 456, 472, 496, 528, 2, 5, 9, 16, 23, 31, 46,
+ 59, 75, 93, 113, 134, 159, 185, 204, 216, 223, 229, 235, 238,
+ 257, 275, 299, 318, 356, 377, 405, 427, 471, 495, 527, 552, 4,
+ 8, 13, 19, 29, 37, 53, 65, 83, 103, 119, 143, 165, 190,
+ 209, 218, 225, 232, 236, 239, 274, 298, 317, 330, 376, 404, 426,
+ 441, 494, 526, 551, 568, 7, 12, 17, 24, 32, 44, 61, 74,
+ 91, 110, 127, 151, 174, 197, 212, 221, 227, 233, 237, 240, 297,
+ 316, 329, 336, 403, 425, 440, 448, 525, 550, 567, 576, 10, 15,
+ 20, 30, 38, 51, 66, 79, 96, 117, 135, 158, 180, 202, 215,
+ 224, 245, 256, 273, 296, 342, 355, 375, 402, 455, 470, 493, 524,
+ 583, 597, 618, 646, 14, 21, 27, 36, 45, 55, 73, 86, 106,
+ 124, 141, 164, 183, 206, 217, 226, 255, 272, 295, 315, 354, 374,
+ 401, 424, 469, 492, 523, 549, 596, 617, 645, 667, 22, 28, 34,
+ 43, 54, 64, 81, 95, 114, 133, 152, 173, 191, 210, 219, 228,
+ 271, 294, 314, 328, 373, 400, 423, 439, 491, 522, 548, 566, 616,
+ 644, 666, 681, 25, 33, 40, 49, 58, 72, 89, 105, 121, 140,
+ 160, 179, 198, 213, 222, 231, 293, 313, 327, 335, 399, 422, 438,
+ 447, 521, 547, 565, 575, 643, 665, 680, 688, 35, 41, 47, 57,
+ 69, 82, 97, 112, 131, 148, 168, 187, 244, 254, 270, 292, 341,
+ 353, 372, 398, 454, 468, 490, 520, 582, 595, 615, 642, 694, 706,
+ 724, 748, 42, 50, 56, 68, 78, 92, 108, 125, 139, 162, 178,
+ 195, 253, 269, 291, 312, 352, 371, 397, 421, 467, 489, 519, 546,
+ 594, 614, 641, 664, 705, 723, 747, 766, 52, 60, 67, 77, 90,
+ 100, 120, 132, 150, 169, 182, 201, 268, 290, 311, 326, 370, 396,
+ 420, 437, 488, 518, 545, 564, 613, 640, 663, 679, 722, 746, 765,
+ 778, 62, 70, 76, 88, 101, 115, 130, 145, 163, 181, 192, 208,
+ 289, 310, 325, 334, 395, 419, 436, 446, 517, 544, 563, 574, 639,
+ 662, 678, 687, 745, 764, 777, 784, 71, 80, 87, 98, 109, 123,
+ 138, 156, 243, 252, 267, 288, 340, 351, 369, 394, 453, 466, 487,
+ 516, 581, 593, 612, 638, 693, 704, 721, 744, 789, 799, 814, 834,
+ 85, 94, 104, 111, 126, 142, 155, 172, 251, 266, 287, 309, 350,
+ 368, 393, 418, 465, 486, 515, 543, 592, 611, 637, 661, 703, 720,
+ 743, 763, 798, 813, 833, 849, 99, 107, 116, 128, 144, 157, 170,
+ 186, 265, 286, 308, 324, 367, 392, 417, 435, 485, 514, 542, 562,
+ 610, 636, 660, 677, 719, 742, 762, 776, 812, 832, 848, 859, 118,
+ 129, 137, 149, 161, 176, 189, 199, 285, 307, 323, 333, 391, 416,
+ 434, 445, 513, 541, 561, 573, 635, 659, 676, 686, 741, 761, 775,
+ 783, 831, 847, 858, 864, 136, 147, 153, 166, 242, 250, 264, 284,
+ 339, 349, 366, 390, 452, 464, 484, 512, 580, 591, 609, 634, 692,
+ 702, 718, 740, 788, 797, 811, 830, 868, 876, 888, 904, 154, 167,
+ 175, 184, 249, 263, 283, 306, 348, 365, 389, 415, 463, 483, 511,
+ 540, 590, 608, 633, 658, 701, 717, 739, 760, 796, 810, 829, 846,
+ 875, 887, 903, 916, 177, 188, 196, 203, 262, 282, 305, 322, 364,
+ 388, 414, 433, 482, 510, 539, 560, 607, 632, 657, 675, 716, 738,
+ 759, 774, 809, 828, 845, 857, 886, 902, 915, 924, 193, 200, 207,
+ 214, 281, 304, 321, 332, 387, 413, 432, 444, 509, 538, 559, 572,
+ 631, 656, 674, 685, 737, 758, 773, 782, 827, 844, 856, 863, 901,
+ 914, 923, 928, 241, 248, 261, 280, 338, 347, 363, 386, 451, 462,
+ 481, 508, 579, 589, 606, 630, 691, 700, 715, 736, 787, 795, 808,
+ 826, 867, 874, 885, 900, 931, 937, 946, 958, 247, 260, 279, 303,
+ 346, 362, 385, 412, 461, 480, 507, 537, 588, 605, 629, 655, 699,
+ 714, 735, 757, 794, 807, 825, 843, 873, 884, 899, 913, 936, 945,
+ 957, 967, 259, 278, 302, 320, 361, 384, 411, 431, 479, 506, 536,
+ 558, 604, 628, 654, 673, 713, 734, 756, 772, 806, 824, 842, 855,
+ 883, 898, 912, 922, 944, 956, 966, 973, 277, 301, 319, 331, 383,
+ 410, 430, 443, 505, 535, 557, 571, 627, 653, 672, 684, 733, 755,
+ 771, 781, 823, 841, 854, 862, 897, 911, 921, 927, 955, 965, 972,
+ 976, 337, 345, 360, 382, 450, 460, 478, 504, 578, 587, 603, 626,
+ 690, 698, 712, 732, 786, 793, 805, 822, 866, 872, 882, 896, 930,
+ 935, 943, 954, 978, 982, 988, 996, 344, 359, 381, 409, 459, 477,
+ 503, 534, 586, 602, 625, 652, 697, 711, 731, 754, 792, 804, 821,
+ 840, 871, 881, 895, 910, 934, 942, 953, 964, 981, 987, 995, 1002,
+ 358, 380, 408, 429, 476, 502, 533, 556, 601, 624, 651, 671, 710,
+ 730, 753, 770, 803, 820, 839, 853, 880, 894, 909, 920, 941, 952,
+ 963, 971, 986, 994, 1001, 1006, 379, 407, 428, 442, 501, 532, 555,
+ 570, 623, 650, 670, 683, 729, 752, 769, 780, 819, 838, 852, 861,
+ 893, 908, 919, 926, 951, 962, 970, 975, 993, 1000, 1005, 1008, 449,
+ 458, 475, 500, 577, 585, 600, 622, 689, 696, 709, 728, 785, 791,
+ 802, 818, 865, 870, 879, 892, 929, 933, 940, 950, 977, 980, 985,
+ 992, 1009, 1011, 1014, 1018, 457, 474, 499, 531, 584, 599, 621, 649,
+ 695, 708, 727, 751, 790, 801, 817, 837, 869, 878, 891, 907, 932,
+ 939, 949, 961, 979, 984, 991, 999, 1010, 1013, 1017, 1021, 473, 498,
+ 530, 554, 598, 620, 648, 669, 707, 726, 750, 768, 800, 816, 836,
+ 851, 877, 890, 906, 918, 938, 948, 960, 969, 983, 990, 998, 1004,
+ 1012, 1016, 1020, 1023, 497, 529, 553, 569, 619, 647, 668, 682, 725,
+ 749, 767, 779, 815, 835, 850, 860, 889, 905, 917, 925, 947, 959,
+ 968, 974, 989, 997, 1003, 1007, 1015, 1019, 1022, 1024,
};
const scan_order vp9_default_scan_orders[TX_SIZES] = {
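
With the +1 baked into every table, the end-of-block position is just the maximum iscan value over the nonzero quantized coefficients, with 0 meaning the block is empty; consumers no longer add 1 after the lookup, which lets vectorized quantizers compute the EOB with a plain max-reduction. A sketch of the consumer-side computation (the function name is illustrative; tran_low_t is shown as int32_t):

    #include <stdint.h>

    static uint16_t compute_eob(const int32_t *qcoeff, const int16_t *iscan,
                                intptr_t n_coeffs) {
      uint16_t eob = 0; /* 0 == no nonzero coefficients */
      intptr_t i;
      for (i = 0; i < n_coeffs; ++i) {
        if (qcoeff[i] != 0 && iscan[i] > eob) eob = (uint16_t)iscan[i];
      }
      return eob;
    }
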
diff --git a/libvpx/vp9/decoder/vp9_decodemv.c b/libvpx/vp9/decoder/vp9_decodemv.c
index 8a8d2ad86..db3e74663 100644
--- a/libvpx/vp9/decoder/vp9_decodemv.c
+++ b/libvpx/vp9/decoder/vp9_decodemv.c
@@ -426,7 +426,9 @@ static INLINE int assign_mv(VP9_COMMON *cm, MACROBLOCKD *xd,
zero_mv_pair(mv);
break;
}
- default: { return 0; }
+ default: {
+ return 0;
+ }
}
return ret;
}
@@ -755,7 +757,7 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi,
if (!assign_mv(cm, xd, b_mode, mi->bmi[j].as_mv, best_ref_mvs,
best_sub8x8, is_compound, allow_hp, r)) {
xd->corrupted |= 1;
- break;
+ return;
}
if (num_4x4_h == 2) mi->bmi[j + 2] = mi->bmi[j];
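
The break-to-return change matters because the failing assign_mv() sits inside nested loops over the 4x4 sub-blocks: break only exits the innermost loop, so later iterations would keep copying and consuming bmi entries that were never assigned. Returning abandons the block as soon as xd->corrupted is set. In miniature (the loop shape is illustrative, not the vp9 code):

    #include <stdio.h>

    static int try_decode(int y, int x) { return !(y == 0 && x == 1); }

    static void decode_block(int *corrupted) {
      int y, x;
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x) {
          if (!try_decode(y, x)) {
            *corrupted = 1;
            return; /* with `break` here, row y == 1 would still run on stale state */
          }
          printf("consumed sub-block (%d,%d)\n", y, x);
        }
      }
    }

    int main(void) {
      int corrupted = 0;
      decode_block(&corrupted);
      return corrupted;
    }
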
diff --git a/libvpx/vp9/decoder/vp9_detokenize.c b/libvpx/vp9/decoder/vp9_detokenize.c
index c2e6b3d54..3ed1bd6ff 100644
--- a/libvpx/vp9/decoder/vp9_detokenize.c
+++ b/libvpx/vp9/decoder/vp9_detokenize.c
@@ -133,17 +133,18 @@ static int decode_coefs(const MACROBLOCKD *xd, PLANE_TYPE type,
int16_t dqv = dq[0];
const uint8_t *const cat6_prob =
#if CONFIG_VP9_HIGHBITDEPTH
- (xd->bd == VPX_BITS_12)
- ? vp9_cat6_prob_high12
- : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2 :
+ (xd->bd == VPX_BITS_12) ? vp9_cat6_prob_high12
+ : (xd->bd == VPX_BITS_10) ? vp9_cat6_prob_high12 + 2
+ :
#endif // CONFIG_VP9_HIGHBITDEPTH
- vp9_cat6_prob;
+ vp9_cat6_prob;
const int cat6_bits =
#if CONFIG_VP9_HIGHBITDEPTH
- (xd->bd == VPX_BITS_12) ? 18
- : (xd->bd == VPX_BITS_10) ? 16 :
+ (xd->bd == VPX_BITS_12) ? 18
+ : (xd->bd == VPX_BITS_10) ? 16
+ :
#endif // CONFIG_VP9_HIGHBITDEPTH
- 14;
+ 14;
// Keep value, range, and count as locals. The compiler produces better
// results with the locals than using r directly.
BD_VALUE value = r->value;
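
Behind the reformatted conditionals is a simple mapping: the category-6 token carries 14 extra bits at 8-bit depth, 16 at 10-bit and 18 at 12-bit, while the probability pointer selects vp9_cat6_prob_high12, the same table offset by 2 entries, or the plain vp9_cat6_prob table. The bit count, written without the ternaries (illustrative only):

    /* 8 -> 14, 10 -> 16, 12 -> 18 extra bits for the cat6 token. */
    static int cat6_bits_for_depth(int bit_depth) {
      return 14 + (bit_depth - 8);
    }
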
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
index a07a1608d..5961be5f3 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
+++ b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -18,28 +18,30 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
int stride) {
- // { 0, 1, 1, 1, 1, 1, 1, 1 };
- const int16x8_t nonzero_bias_a = vextq_s16(vdupq_n_s16(0), vdupq_n_s16(1), 7);
- // { 1, 0, 0, 0, 0, 0, 0, 0 };
- const int16x8_t nonzero_bias_b = vextq_s16(vdupq_n_s16(1), vdupq_n_s16(0), 7);
- int16x8_t mask;
+ // { 0, 1, 1, 1 };
+ const int16x4_t nonzero_bias_a = vext_s16(vdup_n_s16(0), vdup_n_s16(1), 3);
+ // { 1, 0, 0, 0 };
+ const int16x4_t nonzero_bias_b = vext_s16(vdup_n_s16(1), vdup_n_s16(0), 3);
+ int16x4_t mask;
int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
- in[0] = vcombine_s16(input_0, input_1);
- in[1] = vcombine_s16(input_2, input_3);
-
  // Copy the SSE method: use a mask instead of an 'if' branch to add one to
  // the first element when it is non-zero.
- mask = vreinterpretq_s16_u16(vceqq_s16(in[0], nonzero_bias_a));
- in[0] = vaddq_s16(in[0], mask);
- in[0] = vaddq_s16(in[0], nonzero_bias_b);
+ mask = vreinterpret_s16_u16(vceq_s16(input_0, nonzero_bias_a));
+ input_0 = vadd_s16(input_0, mask);
+ input_0 = vadd_s16(input_0, nonzero_bias_b);
+
+ in[0] = vcombine_s16(input_0, input_1);
+ in[1] = vcombine_s16(input_2, input_3);
}
static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) {
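
The reworked bias code preserves the branchless trick from the SSE version, now applied to the 4-lane vector before the combine. The scalar transform conditionally bumps the DC input (if (input[0]) input[0] += 1); comparing against { 0, 1, 1, 1 } can only ever match in lane 0, since the inputs were just shifted left by 4 and are therefore multiples of 16, never 1. The match subtracts 1 exactly when input[0] == 0, and the { 1, 0, 0, 0 } bias adds it back, netting "+1 only if non-zero". Scalar model of the lane-0 logic:

    #include <assert.h>

    /* Emulate `if (v) v += 1;` without a branch; v is input[0] << 4, so it
     * is either 0 or a multiple of 16. */
    static int bump_if_nonzero(int v) {
      int mask = (v == 0) ? -1 : 0; /* vceq lane against the 0 in {0,1,1,1} */
      v += mask;                    /* subtract 1 only when v was 0 */
      v += 1;                       /* the {1,0,0,0} bias */
      return v;
    }

    int main(void) {
      assert(bump_if_nonzero(0) == 0);
      assert(bump_if_nonzero(16) == 17);
      return 0;
    }
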
@@ -53,72 +55,54 @@ static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) {
}
static INLINE void fadst4x4_neon(int16x8_t *in) {
- int32x4_t u0, u1, u2, u3;
- int16x4_t out_0, out_1, out_2, out_3;
- const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
+ int32x4_t u[4], t[4];
+ int16x4_t s[4], out[4];
- const int16x4_t s0 = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 |
- const int16x4_t s1 = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 |
- const int16x4_t s2 = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 |
- const int16x4_t s3 = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 |
+ s[0] = vget_low_s16(in[0]); // | x_00 | x_01 | x_02 | x_03 |
+ s[1] = vget_high_s16(in[0]); // | x_10 | x_11 | x_12 | x_13 |
+ s[2] = vget_low_s16(in[1]); // | x_20 | x_21 | x_22 | x_23 |
+ s[3] = vget_high_s16(in[1]); // | x_30 | x_31 | x_32 | x_33 |
- // s0 * sinpi_1_9, s0 * sinpi_4_9
// Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
- const int32x4_t s0s1_9 = vmull_n_s16(s0, sinpi_1_9);
- const int32x4_t s0s4_9 = vmull_n_s16(s0, sinpi_4_9);
- // s1 * sinpi_1_9, s1 * sinpi_2_9
- const int32x4_t s1s1_9 = vmull_n_s16(s1, sinpi_1_9);
- const int32x4_t s1s2_9 = vmull_n_s16(s1, sinpi_2_9);
- // s2 * sinpi_3_9
- const int32x4_t s2s3_9 = vmull_n_s16(s2, sinpi_3_9);
- // s3 * sinpi_2_9, s3 * sinpi_4_9
- const int32x4_t s3s2_9 = vmull_n_s16(s3, sinpi_2_9);
- const int32x4_t s3s4_9 = vmull_n_s16(s3, sinpi_4_9);
-
- // (s0 + s1) * sinpi_3_9
- const int32x4_t s0_p_s1 = vaddl_s16(s0, s1);
- const int32x4_t s0_p_s1_m_s3 = vsubw_s16(s0_p_s1, s3);
-
- // s_0 * sinpi_1_9 + s_1 * sinpi_2_9
- // s_0 * sinpi_4_9 - s_1 * sinpi_1_9
- const int32x4_t s0s1_9_p_s1s2_9 = vaddq_s32(s0s1_9, s1s2_9);
- const int32x4_t s0s4_9_m_s1s1_9 = vsubq_s32(s0s4_9, s1s1_9);
- /*
- * t0 = s0s1_9 + s1s2_9 + s3s4_9
- * t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
- * t2 = s0s4_9 - s1s1_9 + s3s2_9
- * t3 = s2s3_9
- */
- const int32x4_t t0 = vaddq_s32(s0s1_9_p_s1s2_9, s3s4_9);
- const int32x4_t t1 = vmulq_n_s32(s0_p_s1_m_s3, sinpi_3_9);
- const int32x4_t t2 = vaddq_s32(s0s4_9_m_s1s1_9, s3s2_9);
- const int32x4_t t3 = s2s3_9;
+ // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+ t[0] = vmull_n_s16(s[0], sinpi_1_9);
+ t[0] = vmlal_n_s16(t[0], s[1], sinpi_2_9);
+ t[0] = vmlal_n_s16(t[0], s[3], sinpi_4_9);
+
+ // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+ t[1] = vmull_n_s16(s[0], sinpi_3_9);
+ t[1] = vmlal_n_s16(t[1], s[1], sinpi_3_9);
+ t[1] = vmlsl_n_s16(t[1], s[3], sinpi_3_9);
+
+ // t2 = s0 * sinpi_4_9 - s1 * sinpi_1_9 + s3 * sinpi_2_9
+ t[2] = vmull_n_s16(s[0], sinpi_4_9);
+ t[2] = vmlsl_n_s16(t[2], s[1], sinpi_1_9);
+ t[2] = vmlal_n_s16(t[2], s[3], sinpi_2_9);
+
+ // t3 = s2 * sinpi_3_9
+ t[3] = vmull_n_s16(s[2], sinpi_3_9);
+
/*
* u0 = t0 + t3
* u1 = t1
* u2 = t2 - t3
* u3 = t2 - t0 + t3
*/
- u0 = vaddq_s32(t0, t3);
- u1 = t1;
- u2 = vsubq_s32(t2, t3);
- u3 = vaddq_s32(vsubq_s32(t2, t0), t3);
+ u[0] = vaddq_s32(t[0], t[3]);
+ u[1] = t[1];
+ u[2] = vsubq_s32(t[2], t[3]);
+ u[3] = vaddq_s32(vsubq_s32(t[2], t[0]), t[3]);
// fdct_round_shift
- u0 = vaddq_s32(u0, k__DCT_CONST_ROUNDING);
- u1 = vaddq_s32(u1, k__DCT_CONST_ROUNDING);
- u2 = vaddq_s32(u2, k__DCT_CONST_ROUNDING);
- u3 = vaddq_s32(u3, k__DCT_CONST_ROUNDING);
-
- out_0 = vshrn_n_s32(u0, DCT_CONST_BITS);
- out_1 = vshrn_n_s32(u1, DCT_CONST_BITS);
- out_2 = vshrn_n_s32(u2, DCT_CONST_BITS);
- out_3 = vshrn_n_s32(u3, DCT_CONST_BITS);
+ out[0] = vrshrn_n_s32(u[0], DCT_CONST_BITS);
+ out[1] = vrshrn_n_s32(u[1], DCT_CONST_BITS);
+ out[2] = vrshrn_n_s32(u[2], DCT_CONST_BITS);
+ out[3] = vrshrn_n_s32(u[3], DCT_CONST_BITS);
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
- in[0] = vcombine_s16(out_0, out_1);
- in[1] = vcombine_s16(out_2, out_3);
+ in[0] = vcombine_s16(out[0], out[1]);
+ in[1] = vcombine_s16(out[2], out[3]);
}
void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
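
Most of the mechanical churn in this file is one substitution: the add-DCT_CONST_ROUNDING-then-narrowing-shift pair becomes the single rounding shift vrshrn_n_s32, which computes (u + (1 << 13)) >> 14 in one instruction. The equivalence as used throughout (DCT_CONST_BITS is 14 in libvpx):

    #include <arm_neon.h>

    /* Old shape: add the rounding constant, then truncating narrow shift. */
    static int16x4_t round_shift_old(int32x4_t u) {
      const int32x4_t rounding = vdupq_n_s32(1 << 13); /* DCT_CONST_ROUNDING */
      return vshrn_n_s32(vaddq_s32(u, rounding), 14);
    }

    /* New shape: rounding narrowing shift, identical result. */
    static int16x4_t round_shift_new(int32x4_t u) {
      return vrshrn_n_s32(u, 14);
    }
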
@@ -130,12 +114,14 @@ void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
case ADST_DCT:
load_buffer_4x4(input, in, stride);
fadst4x4_neon(in);
- vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+ // pass1 variant is not accurate enough
+ vpx_fdct4x4_pass2_neon((int16x4_t *)in);
write_buffer_4x4(output, in);
break;
case DCT_ADST:
load_buffer_4x4(input, in, stride);
- vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+ // pass1 variant is not accurate enough
+ vpx_fdct4x4_pass2_neon((int16x4_t *)in);
fadst4x4_neon(in);
write_buffer_4x4(output, in);
break;
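
The pass1-to-pass2 swaps trade a little speed for bit-exactness: a truncating intermediate shift is tolerable as the first half of a two-pass 2-D DCT, but in these hybrid paths the DCT output feeds the ADST stage or the caller directly, and the truncation drifts from the C reference. The underlying difference, in one illustrative line each (not the actual kernels, whose exact rounding behavior is assumed here):

    static int shift_trunc(int v) { return v >> 14; }               /* pass1-style */
    static int shift_round(int v) { return (v + (1 << 13)) >> 14; } /* pass2-style */
    /* shift_trunc(16383) == 0 while shift_round(16383) == 1 */
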
@@ -235,245 +221,158 @@ static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res,
}
static INLINE void fadst8x8_neon(int16x8_t *in) {
- int16x4_t x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi, x4_lo,
- x4_hi, x5_lo, x5_hi, x6_lo, x6_hi, x7_lo, x7_hi;
- int32x4_t s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi, s4_lo,
- s4_hi, s5_lo, s5_hi, s6_lo, s6_hi, s7_lo, s7_hi;
- int32x4_t t0_lo, t0_hi, t1_lo, t1_hi, t2_lo, t2_hi, t3_lo, t3_hi, t4_lo,
- t4_hi, t5_lo, t5_hi, t6_lo, t6_hi, t7_lo, t7_hi;
- const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
-
- x0_lo = vget_low_s16(in[7]);
- x0_hi = vget_high_s16(in[7]);
- x1_lo = vget_low_s16(in[0]);
- x1_hi = vget_high_s16(in[0]);
- x2_lo = vget_low_s16(in[5]);
- x2_hi = vget_high_s16(in[5]);
- x3_lo = vget_low_s16(in[2]);
- x3_hi = vget_high_s16(in[2]);
- x4_lo = vget_low_s16(in[3]);
- x4_hi = vget_high_s16(in[3]);
- x5_lo = vget_low_s16(in[4]);
- x5_hi = vget_high_s16(in[4]);
- x6_lo = vget_low_s16(in[1]);
- x6_hi = vget_high_s16(in[1]);
- x7_lo = vget_low_s16(in[6]);
- x7_hi = vget_high_s16(in[6]);
+ int16x4_t x_lo[8], x_hi[8];
+ int32x4_t s_lo[8], s_hi[8];
+ int32x4_t t_lo[8], t_hi[8];
+
+ x_lo[0] = vget_low_s16(in[7]);
+ x_hi[0] = vget_high_s16(in[7]);
+ x_lo[1] = vget_low_s16(in[0]);
+ x_hi[1] = vget_high_s16(in[0]);
+ x_lo[2] = vget_low_s16(in[5]);
+ x_hi[2] = vget_high_s16(in[5]);
+ x_lo[3] = vget_low_s16(in[2]);
+ x_hi[3] = vget_high_s16(in[2]);
+ x_lo[4] = vget_low_s16(in[3]);
+ x_hi[4] = vget_high_s16(in[3]);
+ x_lo[5] = vget_low_s16(in[4]);
+ x_hi[5] = vget_high_s16(in[4]);
+ x_lo[6] = vget_low_s16(in[1]);
+ x_hi[6] = vget_high_s16(in[1]);
+ x_lo[7] = vget_low_s16(in[6]);
+ x_hi[7] = vget_high_s16(in[6]);
// stage 1
// s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s0_lo = vaddq_s32(vmull_n_s16(x0_lo, cospi_2_64),
- vmull_n_s16(x1_lo, cospi_30_64));
- s0_hi = vaddq_s32(vmull_n_s16(x0_hi, cospi_2_64),
- vmull_n_s16(x1_hi, cospi_30_64));
// s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s1_lo = vsubq_s32(vmull_n_s16(x0_lo, cospi_30_64),
- vmull_n_s16(x1_lo, cospi_2_64));
- s1_hi = vsubq_s32(vmull_n_s16(x0_hi, cospi_30_64),
- vmull_n_s16(x1_hi, cospi_2_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+ cospi_2_64, cospi_30_64, &s_lo[0],
+ &s_hi[0], &s_lo[1], &s_hi[1]);
+
// s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s2_lo = vaddq_s32(vmull_n_s16(x2_lo, cospi_10_64),
- vmull_n_s16(x3_lo, cospi_22_64));
- s2_hi = vaddq_s32(vmull_n_s16(x2_hi, cospi_10_64),
- vmull_n_s16(x3_hi, cospi_22_64));
// s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s3_lo = vsubq_s32(vmull_n_s16(x2_lo, cospi_22_64),
- vmull_n_s16(x3_lo, cospi_10_64));
- s3_hi = vsubq_s32(vmull_n_s16(x2_hi, cospi_22_64),
- vmull_n_s16(x3_hi, cospi_10_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+ cospi_10_64, cospi_22_64, &s_lo[2],
+ &s_hi[2], &s_lo[3], &s_hi[3]);
+
// s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s4_lo = vaddq_s32(vmull_n_s16(x4_lo, cospi_18_64),
- vmull_n_s16(x5_lo, cospi_14_64));
- s4_hi = vaddq_s32(vmull_n_s16(x4_hi, cospi_18_64),
- vmull_n_s16(x5_hi, cospi_14_64));
// s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s5_lo = vsubq_s32(vmull_n_s16(x4_lo, cospi_14_64),
- vmull_n_s16(x5_lo, cospi_18_64));
- s5_hi = vsubq_s32(vmull_n_s16(x4_hi, cospi_14_64),
- vmull_n_s16(x5_hi, cospi_18_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+ cospi_18_64, cospi_14_64, &s_lo[4],
+ &s_hi[4], &s_lo[5], &s_hi[5]);
+
// s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s6_lo = vaddq_s32(vmull_n_s16(x6_lo, cospi_26_64),
- vmull_n_s16(x7_lo, cospi_6_64));
- s6_hi = vaddq_s32(vmull_n_s16(x6_hi, cospi_26_64),
- vmull_n_s16(x7_hi, cospi_6_64));
// s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
- s7_lo = vsubq_s32(vmull_n_s16(x6_lo, cospi_6_64),
- vmull_n_s16(x7_lo, cospi_26_64));
- s7_hi = vsubq_s32(vmull_n_s16(x6_hi, cospi_6_64),
- vmull_n_s16(x7_hi, cospi_26_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+ cospi_26_64, cospi_6_64, &s_lo[6],
+ &s_hi[6], &s_lo[7], &s_hi[7]);
// fdct_round_shift
- t0_lo = vaddq_s32(s0_lo, s4_lo);
- t0_hi = vaddq_s32(s0_hi, s4_hi);
- t1_lo = vaddq_s32(s1_lo, s5_lo);
- t1_hi = vaddq_s32(s1_hi, s5_hi);
- t2_lo = vaddq_s32(s2_lo, s6_lo);
- t2_hi = vaddq_s32(s2_hi, s6_hi);
- t3_lo = vaddq_s32(s3_lo, s7_lo);
- t3_hi = vaddq_s32(s3_hi, s7_hi);
- t4_lo = vsubq_s32(s0_lo, s4_lo);
- t4_hi = vsubq_s32(s0_hi, s4_hi);
- t5_lo = vsubq_s32(s1_lo, s5_lo);
- t5_hi = vsubq_s32(s1_hi, s5_hi);
- t6_lo = vsubq_s32(s2_lo, s6_lo);
- t6_hi = vsubq_s32(s2_hi, s6_hi);
- t7_lo = vsubq_s32(s3_lo, s7_lo);
- t7_hi = vsubq_s32(s3_hi, s7_hi);
-
- t0_lo = vaddq_s32(t0_lo, k__DCT_CONST_ROUNDING);
- t0_hi = vaddq_s32(t0_hi, k__DCT_CONST_ROUNDING);
- t1_lo = vaddq_s32(t1_lo, k__DCT_CONST_ROUNDING);
- t1_hi = vaddq_s32(t1_hi, k__DCT_CONST_ROUNDING);
- t2_lo = vaddq_s32(t2_lo, k__DCT_CONST_ROUNDING);
- t2_hi = vaddq_s32(t2_hi, k__DCT_CONST_ROUNDING);
- t3_lo = vaddq_s32(t3_lo, k__DCT_CONST_ROUNDING);
- t3_hi = vaddq_s32(t3_hi, k__DCT_CONST_ROUNDING);
- t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING);
- t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING);
- t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING);
- t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING);
- t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING);
- t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING);
- t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING);
- t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING);
-
- t0_lo = vshrq_n_s32(t0_lo, DCT_CONST_BITS);
- t0_hi = vshrq_n_s32(t0_hi, DCT_CONST_BITS);
- t1_lo = vshrq_n_s32(t1_lo, DCT_CONST_BITS);
- t1_hi = vshrq_n_s32(t1_hi, DCT_CONST_BITS);
- t2_lo = vshrq_n_s32(t2_lo, DCT_CONST_BITS);
- t2_hi = vshrq_n_s32(t2_hi, DCT_CONST_BITS);
- t3_lo = vshrq_n_s32(t3_lo, DCT_CONST_BITS);
- t3_hi = vshrq_n_s32(t3_hi, DCT_CONST_BITS);
- t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS);
- t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS);
- t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS);
- t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS);
- t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS);
- t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS);
- t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS);
- t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS);
+ t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+ t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+ t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+ t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+ t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+ t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+ t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+ t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
+ t_lo[4] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[4]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[4]), DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[5]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[5]), DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[6]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[6]), DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[7]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[7]), DCT_CONST_BITS);
// stage 2
- s0_lo = t0_lo;
- s0_hi = t0_hi;
- s1_lo = t1_lo;
- s1_hi = t1_hi;
- s2_lo = t2_lo;
- s2_hi = t2_hi;
- s3_lo = t3_lo;
- s3_hi = t3_hi;
- s4_lo = vaddq_s32(vmulq_n_s32(t4_lo, cospi_8_64),
- vmulq_n_s32(t5_lo, cospi_24_64));
- s4_hi = vaddq_s32(vmulq_n_s32(t4_hi, cospi_8_64),
- vmulq_n_s32(t5_hi, cospi_24_64));
- s5_lo = vsubq_s32(vmulq_n_s32(t4_lo, cospi_24_64),
- vmulq_n_s32(t5_lo, cospi_8_64));
- s5_hi = vsubq_s32(vmulq_n_s32(t4_hi, cospi_24_64),
- vmulq_n_s32(t5_hi, cospi_8_64));
- s6_lo = vaddq_s32(vmulq_n_s32(t6_lo, -cospi_24_64),
- vmulq_n_s32(t7_lo, cospi_8_64));
- s6_hi = vaddq_s32(vmulq_n_s32(t6_hi, -cospi_24_64),
- vmulq_n_s32(t7_hi, cospi_8_64));
- s7_lo = vaddq_s32(vmulq_n_s32(t6_lo, cospi_8_64),
- vmulq_n_s32(t7_lo, cospi_24_64));
- s7_hi = vaddq_s32(vmulq_n_s32(t6_hi, cospi_8_64),
- vmulq_n_s32(t7_hi, cospi_24_64));
+ s_lo[0] = t_lo[0];
+ s_hi[0] = t_hi[0];
+ s_lo[1] = t_lo[1];
+ s_hi[1] = t_hi[1];
+ s_lo[2] = t_lo[2];
+ s_hi[2] = t_hi[2];
+ s_lo[3] = t_lo[3];
+ s_hi[3] = t_hi[3];
+ // s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ // s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+ cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+ &s_lo[5], &s_hi[5]);
+
+ // s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ // s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ butterfly_two_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+ -cospi_24_64, cospi_8_64, &s_lo[6], &s_hi[6],
+ &s_lo[7], &s_hi[7]);
+ // fdct_round_shift
// s0 + s2
- t0_lo = vaddq_s32(s0_lo, s2_lo);
- t0_hi = vaddq_s32(s0_hi, s2_hi);
+ t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+ t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
// s1 + s3
- t1_lo = vaddq_s32(s1_lo, s3_lo);
- t1_hi = vaddq_s32(s1_hi, s3_hi);
+ t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+ t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
// s0 - s2
- t2_lo = vsubq_s32(s0_lo, s2_lo);
- t2_hi = vsubq_s32(s0_hi, s2_hi);
+ t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+ t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
// s1 - s3
- t3_lo = vsubq_s32(s1_lo, s3_lo);
- t3_hi = vsubq_s32(s1_hi, s3_hi);
+ t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+ t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
// s4 + s6
- t4_lo = vaddq_s32(s4_lo, s6_lo);
- t4_hi = vaddq_s32(s4_hi, s6_hi);
+ t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
// s5 + s7
- t5_lo = vaddq_s32(s5_lo, s7_lo);
- t5_hi = vaddq_s32(s5_hi, s7_hi);
+ t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
// s4 - s6
- t6_lo = vsubq_s32(s4_lo, s6_lo);
- t6_hi = vsubq_s32(s4_hi, s6_hi);
+ t_lo[6] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[6]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[6]), DCT_CONST_BITS);
// s5 - s7
- t7_lo = vsubq_s32(s5_lo, s7_lo);
- t7_hi = vsubq_s32(s5_hi, s7_hi);
-
- // fdct_round_shift
- t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING);
- t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING);
- t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING);
- t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING);
- t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING);
- t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING);
- t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING);
- t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING);
- t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS);
- t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS);
- t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS);
- t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS);
- t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS);
- t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS);
- t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS);
- t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[7]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[7]), DCT_CONST_BITS);
// stage 3
// cospi_16_64 * (x2 + x3)
- s2_lo = vmulq_n_s32(vaddq_s32(t2_lo, t3_lo), cospi_16_64);
- s2_hi = vmulq_n_s32(vaddq_s32(t2_hi, t3_hi), cospi_16_64);
// cospi_16_64 * (x2 - x3)
- s3_lo = vmulq_n_s32(vsubq_s32(t2_lo, t3_lo), cospi_16_64);
- s3_hi = vmulq_n_s32(vsubq_s32(t2_hi, t3_hi), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[2], t_hi[2], t_lo[3], t_hi[3],
+ cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+ &s_hi[3]);
+
// cospi_16_64 * (x6 + x7)
- s6_lo = vmulq_n_s32(vaddq_s32(t6_lo, t7_lo), cospi_16_64);
- s6_hi = vmulq_n_s32(vaddq_s32(t6_hi, t7_hi), cospi_16_64);
// cospi_16_64 * (x6 - x7)
- s7_lo = vmulq_n_s32(vsubq_s32(t6_lo, t7_lo), cospi_16_64);
- s7_hi = vmulq_n_s32(vsubq_s32(t6_hi, t7_hi), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[6], t_hi[6], t_lo[7], t_hi[7],
+ cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+ &s_hi[7]);
// final fdct_round_shift
- t2_lo = vaddq_s32(s2_lo, k__DCT_CONST_ROUNDING);
- t2_hi = vaddq_s32(s2_hi, k__DCT_CONST_ROUNDING);
- t3_lo = vaddq_s32(s3_lo, k__DCT_CONST_ROUNDING);
- t3_hi = vaddq_s32(s3_hi, k__DCT_CONST_ROUNDING);
- t6_lo = vaddq_s32(s6_lo, k__DCT_CONST_ROUNDING);
- t6_hi = vaddq_s32(s6_hi, k__DCT_CONST_ROUNDING);
- t7_lo = vaddq_s32(s7_lo, k__DCT_CONST_ROUNDING);
- t7_hi = vaddq_s32(s7_hi, k__DCT_CONST_ROUNDING);
-
- x2_lo = vshrn_n_s32(t2_lo, DCT_CONST_BITS);
- x2_hi = vshrn_n_s32(t2_hi, DCT_CONST_BITS);
- x3_lo = vshrn_n_s32(t3_lo, DCT_CONST_BITS);
- x3_hi = vshrn_n_s32(t3_hi, DCT_CONST_BITS);
- x6_lo = vshrn_n_s32(t6_lo, DCT_CONST_BITS);
- x6_hi = vshrn_n_s32(t6_hi, DCT_CONST_BITS);
- x7_lo = vshrn_n_s32(t7_lo, DCT_CONST_BITS);
- x7_hi = vshrn_n_s32(t7_hi, DCT_CONST_BITS);
+ x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+ x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+ x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+ x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+ x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+ x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+ x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+ x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
// x0, x1, x4, x5 narrow down to 16 bits directly
- x0_lo = vmovn_s32(t0_lo);
- x0_hi = vmovn_s32(t0_hi);
- x1_lo = vmovn_s32(t1_lo);
- x1_hi = vmovn_s32(t1_hi);
- x4_lo = vmovn_s32(t4_lo);
- x4_hi = vmovn_s32(t4_hi);
- x5_lo = vmovn_s32(t5_lo);
- x5_hi = vmovn_s32(t5_hi);
-
- in[0] = vcombine_s16(x0_lo, x0_hi);
- in[1] = vnegq_s16(vcombine_s16(x4_lo, x4_hi));
- in[2] = vcombine_s16(x6_lo, x6_hi);
- in[3] = vnegq_s16(vcombine_s16(x2_lo, x2_hi));
- in[4] = vcombine_s16(x3_lo, x3_hi);
- in[5] = vnegq_s16(vcombine_s16(x7_lo, x7_hi));
- in[6] = vcombine_s16(x5_lo, x5_hi);
- in[7] = vnegq_s16(vcombine_s16(x1_lo, x1_hi));
+ x_lo[0] = vmovn_s32(t_lo[0]);
+ x_hi[0] = vmovn_s32(t_hi[0]);
+ x_lo[1] = vmovn_s32(t_lo[1]);
+ x_hi[1] = vmovn_s32(t_hi[1]);
+ x_lo[4] = vmovn_s32(t_lo[4]);
+ x_hi[4] = vmovn_s32(t_hi[4]);
+ x_lo[5] = vmovn_s32(t_lo[5]);
+ x_hi[5] = vmovn_s32(t_hi[5]);
+
+ in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+ in[1] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+ in[2] = vcombine_s16(x_lo[6], x_hi[6]);
+ in[3] = vnegq_s16(vcombine_s16(x_lo[2], x_hi[2]));
+ in[4] = vcombine_s16(x_lo[3], x_hi[3]);
+ in[5] = vnegq_s16(vcombine_s16(x_lo[7], x_hi[7]));
+ in[6] = vcombine_s16(x_lo[5], x_hi[5]);
+ in[7] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
&in[7]);
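
The repeated butterfly_two_coeff_s16_s32_noround() calls collapse the pattern spelled out in the removed lines: two widening multiply-accumulates producing add = c0*a + c1*b and sub = c1*a - c0*b at 32-bit precision, with no rounding ("noround"; the vrshrq/vrshrn that follows supplies it). A plausible shape for the helper, inferred from the call sites in this hunk rather than quoted from fdct_neon.h:

    #include <arm_neon.h>

    /* add = c0 * a + c1 * b, sub = c1 * a - c0 * b, over an 8-lane input
     * split into low/high halves, widened to 32 bits, no rounding. */
    static void butterfly_two_coeff_s16_s32_noround_sketch(
        int16x4_t a_lo, int16x4_t a_hi, int16x4_t b_lo, int16x4_t b_hi,
        int16_t c0, int16_t c1, int32x4_t *add_lo, int32x4_t *add_hi,
        int32x4_t *sub_lo, int32x4_t *sub_hi) {
      *add_lo = vmlal_n_s16(vmull_n_s16(a_lo, c0), b_lo, c1);
      *add_hi = vmlal_n_s16(vmull_n_s16(a_hi, c0), b_hi, c1);
      *sub_lo = vmlsl_n_s16(vmull_n_s16(a_lo, c1), b_lo, c0);
      *sub_hi = vmlsl_n_s16(vmull_n_s16(a_hi, c1), b_hi, c0);
    }
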
@@ -488,13 +387,15 @@ void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride,
case ADST_DCT:
load_buffer_8x8(input, in, stride);
fadst8x8_neon(in);
- vpx_fdct8x8_pass1_neon(in);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
case DCT_ADST:
load_buffer_8x8(input, in, stride);
- vpx_fdct8x8_pass1_neon(in);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(in);
fadst8x8_neon(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
@@ -547,7 +448,6 @@ static void fdct16_8col(int16x8_t *in) {
int16x8_t i[8], s1[8], s2[8], s3[8], t[8];
int16x4_t t_lo[8], t_hi[8];
int32x4_t u_lo[8], u_hi[8];
- const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
// stage 1
i[0] = vaddq_s16(in[0], in[15]);
@@ -559,7 +459,8 @@ static void fdct16_8col(int16x8_t *in) {
i[6] = vaddq_s16(in[6], in[9]);
i[7] = vaddq_s16(in[7], in[8]);
- vpx_fdct8x8_pass1_neon(i);
+ // pass1 variant is not accurate enough
+ vpx_fdct8x8_pass2_neon(i);
transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]);
// step 2
@@ -595,23 +496,14 @@ static void fdct16_8col(int16x8_t *in) {
u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64);
u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64);
- u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
- u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
- u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING);
- u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING);
- u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING);
- u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING);
- u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
- u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
-
- t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
- t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
- t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS);
- t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS);
- t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS);
- t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS);
- t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+ t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+ t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
s2[3] = vcombine_s16(t_lo[3], t_hi[3]);
@@ -646,40 +538,26 @@ static void fdct16_8col(int16x8_t *in) {
t_lo[7] = vget_low_s16(s3[7]);
t_hi[7] = vget_high_s16(s3[7]);
- u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_8_64),
- vmull_n_s16(t_lo[6], cospi_24_64));
- u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_8_64),
- vmull_n_s16(t_hi[6], cospi_24_64));
- u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_24_64),
- vmull_n_s16(t_lo[5], cospi_8_64));
- u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_24_64),
- vmull_n_s16(t_hi[5], cospi_8_64));
- u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_8_64),
- vmull_n_s16(t_lo[5], -cospi_24_64));
- u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_8_64),
- vmull_n_s16(t_hi[5], -cospi_24_64));
- u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_24_64),
- vmull_n_s16(t_lo[6], cospi_8_64));
- u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_24_64),
- vmull_n_s16(t_hi[6], cospi_8_64));
-
- u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING);
- u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING);
- u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
- u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
- u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
- u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
- u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING);
- u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING);
-
- t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS);
- t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS);
- t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
- t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
- t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
- t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS);
- t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+ // u[1] = -cospi_8_64 * t[1] + cospi_24_64 * t[6]
+ // u[6] = cospi_24_64 * t[1] + cospi_8_64 * t[6]
+ butterfly_two_coeff_s16_s32_noround(t_lo[1], t_hi[1], t_lo[6], t_hi[6],
+ -cospi_8_64, cospi_24_64, &u_lo[1],
+ &u_hi[1], &u_lo[6], &u_hi[6]);
+
+ // u[5] = -cospi_24_64 * t[5] + cospi_8_64 * t[2]
+ // u[2] = cospi_8_64 * t[5] + cospi_24_64 * t[2]
+ butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+ -cospi_24_64, cospi_8_64, &u_lo[5],
+ &u_hi[5], &u_lo[2], &u_hi[2]);
+
+ t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+ t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
s2[1] = vcombine_s16(t_lo[1], t_hi[1]);
s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
@@ -714,88 +592,47 @@ static void fdct16_8col(int16x8_t *in) {
t_lo[7] = vget_low_s16(s1[7]);
t_hi[7] = vget_high_s16(s1[7]);
- // step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
- u_lo[0] = vaddq_s32(vmull_n_s16(t_lo[0], cospi_30_64),
- vmull_n_s16(t_lo[7], cospi_2_64));
- u_hi[0] = vaddq_s32(vmull_n_s16(t_hi[0], cospi_30_64),
- vmull_n_s16(t_hi[7], cospi_2_64));
-
- // step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_14_64),
- vmull_n_s16(t_lo[6], cospi_18_64));
- u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_14_64),
- vmull_n_s16(t_hi[6], cospi_18_64));
-
- // step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
- u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_22_64),
- vmull_n_s16(t_lo[5], cospi_10_64));
- u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_22_64),
- vmull_n_s16(t_hi[5], cospi_10_64));
-
- // step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- u_lo[3] = vaddq_s32(vmull_n_s16(t_lo[3], cospi_6_64),
- vmull_n_s16(t_lo[4], cospi_26_64));
- u_hi[3] = vaddq_s32(vmull_n_s16(t_hi[3], cospi_6_64),
- vmull_n_s16(t_hi[4], cospi_26_64));
-
- // step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
- u_lo[4] = vaddq_s32(vmull_n_s16(t_lo[3], -cospi_26_64),
- vmull_n_s16(t_lo[4], cospi_6_64));
- u_hi[4] = vaddq_s32(vmull_n_s16(t_hi[3], -cospi_26_64),
- vmull_n_s16(t_hi[4], cospi_6_64));
-
- // step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], -cospi_10_64),
- vmull_n_s16(t_lo[5], cospi_22_64));
- u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], -cospi_10_64),
- vmull_n_s16(t_hi[5], cospi_22_64));
-
- // step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
- u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_18_64),
- vmull_n_s16(t_lo[6], cospi_14_64));
- u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_18_64),
- vmull_n_s16(t_hi[6], cospi_14_64));
-
- // step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- u_lo[7] = vaddq_s32(vmull_n_s16(t_lo[0], -cospi_2_64),
- vmull_n_s16(t_lo[7], cospi_30_64));
- u_hi[7] = vaddq_s32(vmull_n_s16(t_hi[0], -cospi_2_64),
- vmull_n_s16(t_hi[7], cospi_30_64));
+ // u[0] = step1[7] * cospi_2_64 + step1[0] * cospi_30_64
+ // u[7] = step1[7] * cospi_30_64 - step1[0] * cospi_2_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[7], t_hi[7], t_lo[0], t_hi[0],
+ cospi_2_64, cospi_30_64, &u_lo[0],
+ &u_hi[0], &u_lo[7], &u_hi[7]);
+
+ // u[1] = step1[6] * cospi_18_64 + step1[1] * cospi_14_64
+ // u[6] = step1[6] * cospi_14_64 - step1[1] * cospi_18_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[6], t_hi[6], t_lo[1], t_hi[1],
+ cospi_18_64, cospi_14_64, &u_lo[1],
+ &u_hi[1], &u_lo[6], &u_hi[6]);
+
+ // u[2] = step1[5] * cospi_10_64 + step1[2] * cospi_22_64
+ // u[5] = step1[5] * cospi_22_64 - step1[2] * cospi_10_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[5], t_hi[5], t_lo[2], t_hi[2],
+ cospi_10_64, cospi_22_64, &u_lo[2],
+ &u_hi[2], &u_lo[5], &u_hi[5]);
+
+ // u[3] = step1[4] * cospi_26_64 + step1[3] * cospi_6_64
+ // u[4] = step1[4] * cospi_6_64 - step1[3] * cospi_26_64
+ butterfly_two_coeff_s16_s32_noround(t_lo[4], t_hi[4], t_lo[3], t_hi[3],
+ cospi_26_64, cospi_6_64, &u_lo[3],
+ &u_hi[3], &u_lo[4], &u_hi[4]);
// final fdct_round_shift
- u_lo[0] = vaddq_s32(u_lo[0], k__DCT_CONST_ROUNDING);
- u_hi[0] = vaddq_s32(u_hi[0], k__DCT_CONST_ROUNDING);
- u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING);
- u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING);
- u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
- u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
- u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING);
- u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING);
- u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING);
- u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING);
- u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
- u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
- u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING);
- u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING);
- u_lo[7] = vaddq_s32(u_lo[7], k__DCT_CONST_ROUNDING);
- u_hi[7] = vaddq_s32(u_hi[7], k__DCT_CONST_ROUNDING);
-
- t_lo[0] = vshrn_n_s32(u_lo[0], DCT_CONST_BITS);
- t_hi[0] = vshrn_n_s32(u_hi[0], DCT_CONST_BITS);
- t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS);
- t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS);
- t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
- t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
- t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS);
- t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS);
- t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS);
- t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS);
- t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
- t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS);
- t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS);
- t_lo[7] = vshrn_n_s32(u_lo[7], DCT_CONST_BITS);
- t_hi[7] = vshrn_n_s32(u_hi[7], DCT_CONST_BITS);
+ t_lo[0] = vrshrn_n_s32(u_lo[0], DCT_CONST_BITS);
+ t_hi[0] = vrshrn_n_s32(u_hi[0], DCT_CONST_BITS);
+ t_lo[1] = vrshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+ t_hi[1] = vrshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+ t_lo[2] = vrshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+ t_hi[2] = vrshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+ t_lo[3] = vrshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+ t_hi[3] = vrshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+ t_lo[4] = vrshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+ t_lo[7] = vrshrn_n_s32(u_lo[7], DCT_CONST_BITS);
+ t_hi[7] = vrshrn_n_s32(u_hi[7], DCT_CONST_BITS);
in[0] = i[0];
in[2] = i[1];
@@ -820,7 +657,6 @@ static void fadst16_8col(int16x8_t *in) {
int16x4_t x_lo[16], x_hi[16];
int32x4_t s_lo[16], s_hi[16];
int32x4_t t_lo[16], t_hi[16];
- const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
x_lo[0] = vget_low_s16(in[15]);
x_hi[0] = vget_high_s16(in[15]);
@@ -857,185 +693,79 @@ static void fadst16_8col(int16x8_t *in) {
// stage 1
// s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
- s_lo[0] = vaddq_s32(vmull_n_s16(x_lo[0], cospi_1_64),
- vmull_n_s16(x_lo[1], cospi_31_64));
- s_hi[0] = vaddq_s32(vmull_n_s16(x_hi[0], cospi_1_64),
- vmull_n_s16(x_hi[1], cospi_31_64));
// s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
- s_lo[1] = vsubq_s32(vmull_n_s16(x_lo[0], cospi_31_64),
- vmull_n_s16(x_lo[1], cospi_1_64));
- s_hi[1] = vsubq_s32(vmull_n_s16(x_hi[0], cospi_31_64),
- vmull_n_s16(x_hi[1], cospi_1_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[0], x_hi[0], x_lo[1], x_hi[1],
+ cospi_1_64, cospi_31_64, &s_lo[0],
+ &s_hi[0], &s_lo[1], &s_hi[1]);
// s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
- s_lo[2] = vaddq_s32(vmull_n_s16(x_lo[2], cospi_5_64),
- vmull_n_s16(x_lo[3], cospi_27_64));
- s_hi[2] = vaddq_s32(vmull_n_s16(x_hi[2], cospi_5_64),
- vmull_n_s16(x_hi[3], cospi_27_64));
// s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
- s_lo[3] = vsubq_s32(vmull_n_s16(x_lo[2], cospi_27_64),
- vmull_n_s16(x_lo[3], cospi_5_64));
- s_hi[3] = vsubq_s32(vmull_n_s16(x_hi[2], cospi_27_64),
- vmull_n_s16(x_hi[3], cospi_5_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[2], x_hi[2], x_lo[3], x_hi[3],
+ cospi_5_64, cospi_27_64, &s_lo[2],
+ &s_hi[2], &s_lo[3], &s_hi[3]);
// s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
- s_lo[4] = vaddq_s32(vmull_n_s16(x_lo[4], cospi_9_64),
- vmull_n_s16(x_lo[5], cospi_23_64));
- s_hi[4] = vaddq_s32(vmull_n_s16(x_hi[4], cospi_9_64),
- vmull_n_s16(x_hi[5], cospi_23_64));
// s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
- s_lo[5] = vsubq_s32(vmull_n_s16(x_lo[4], cospi_23_64),
- vmull_n_s16(x_lo[5], cospi_9_64));
- s_hi[5] = vsubq_s32(vmull_n_s16(x_hi[4], cospi_23_64),
- vmull_n_s16(x_hi[5], cospi_9_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[4], x_hi[4], x_lo[5], x_hi[5],
+ cospi_9_64, cospi_23_64, &s_lo[4],
+ &s_hi[4], &s_lo[5], &s_hi[5]);
// s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
- s_lo[6] = vaddq_s32(vmull_n_s16(x_lo[6], cospi_13_64),
- vmull_n_s16(x_lo[7], cospi_19_64));
- s_hi[6] = vaddq_s32(vmull_n_s16(x_hi[6], cospi_13_64),
- vmull_n_s16(x_hi[7], cospi_19_64));
// s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
- s_lo[7] = vsubq_s32(vmull_n_s16(x_lo[6], cospi_19_64),
- vmull_n_s16(x_lo[7], cospi_13_64));
- s_hi[7] = vsubq_s32(vmull_n_s16(x_hi[6], cospi_19_64),
- vmull_n_s16(x_hi[7], cospi_13_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[6], x_hi[6], x_lo[7], x_hi[7],
+ cospi_13_64, cospi_19_64, &s_lo[6],
+ &s_hi[6], &s_lo[7], &s_hi[7]);
// s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
- s_lo[8] = vaddq_s32(vmull_n_s16(x_lo[8], cospi_17_64),
- vmull_n_s16(x_lo[9], cospi_15_64));
- s_hi[8] = vaddq_s32(vmull_n_s16(x_hi[8], cospi_17_64),
- vmull_n_s16(x_hi[9], cospi_15_64));
// s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
- s_lo[9] = vsubq_s32(vmull_n_s16(x_lo[8], cospi_15_64),
- vmull_n_s16(x_lo[9], cospi_17_64));
- s_hi[9] = vsubq_s32(vmull_n_s16(x_hi[8], cospi_15_64),
- vmull_n_s16(x_hi[9], cospi_17_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[8], x_hi[8], x_lo[9], x_hi[9],
+ cospi_17_64, cospi_15_64, &s_lo[8],
+ &s_hi[8], &s_lo[9], &s_hi[9]);
// s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
- s_lo[10] = vaddq_s32(vmull_n_s16(x_lo[10], cospi_21_64),
- vmull_n_s16(x_lo[11], cospi_11_64));
- s_hi[10] = vaddq_s32(vmull_n_s16(x_hi[10], cospi_21_64),
- vmull_n_s16(x_hi[11], cospi_11_64));
// s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
- s_lo[11] = vsubq_s32(vmull_n_s16(x_lo[10], cospi_11_64),
- vmull_n_s16(x_lo[11], cospi_21_64));
- s_hi[11] = vsubq_s32(vmull_n_s16(x_hi[10], cospi_11_64),
- vmull_n_s16(x_hi[11], cospi_21_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[10], x_hi[10], x_lo[11], x_hi[11],
+ cospi_21_64, cospi_11_64, &s_lo[10],
+ &s_hi[10], &s_lo[11], &s_hi[11]);
// s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
- s_lo[12] = vaddq_s32(vmull_n_s16(x_lo[12], cospi_25_64),
- vmull_n_s16(x_lo[13], cospi_7_64));
- s_hi[12] = vaddq_s32(vmull_n_s16(x_hi[12], cospi_25_64),
- vmull_n_s16(x_hi[13], cospi_7_64));
// s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
- s_lo[13] = vsubq_s32(vmull_n_s16(x_lo[12], cospi_7_64),
- vmull_n_s16(x_lo[13], cospi_25_64));
- s_hi[13] = vsubq_s32(vmull_n_s16(x_hi[12], cospi_7_64),
- vmull_n_s16(x_hi[13], cospi_25_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[12], x_hi[12], x_lo[13], x_hi[13],
+ cospi_25_64, cospi_7_64, &s_lo[12],
+ &s_hi[12], &s_lo[13], &s_hi[13]);
// s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
- s_lo[14] = vaddq_s32(vmull_n_s16(x_lo[14], cospi_29_64),
- vmull_n_s16(x_lo[15], cospi_3_64));
- s_hi[14] = vaddq_s32(vmull_n_s16(x_hi[14], cospi_29_64),
- vmull_n_s16(x_hi[15], cospi_3_64));
// s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
- s_lo[15] = vsubq_s32(vmull_n_s16(x_lo[14], cospi_3_64),
- vmull_n_s16(x_lo[15], cospi_29_64));
- s_hi[15] = vsubq_s32(vmull_n_s16(x_hi[14], cospi_3_64),
- vmull_n_s16(x_hi[15], cospi_29_64));
+ butterfly_two_coeff_s16_s32_noround(x_lo[14], x_hi[14], x_lo[15], x_hi[15],
+ cospi_29_64, cospi_3_64, &s_lo[14],
+ &s_hi[14], &s_lo[15], &s_hi[15]);
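Every removed vmull/vadd/vsub pair above evaluates the same cross-multiply, add = c1 * a + c2 * b and sub = c2 * a - c1 * b, which is why stage 1 collapses into eight calls of a shared helper. The helper itself is defined in vpx_dsp/arm/fdct_neon.h; the sketch below is an assumed shape, not the verbatim definition:

    #include <arm_neon.h>

    // Assumed shape of the widening butterfly helper; no fdct_round_shift.
    static inline void butterfly_two_coeff_s16_s32_noround(
        const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
        const int16x4_t b_hi, const int16_t c1, const int16_t c2,
        int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
        int32x4_t *sub_hi) {
      *add_lo = vmlal_n_s16(vmull_n_s16(a_lo, c1), b_lo, c2);  // c1*a + c2*b
      *add_hi = vmlal_n_s16(vmull_n_s16(a_hi, c1), b_hi, c2);
      *sub_lo = vmlsl_n_s16(vmull_n_s16(a_lo, c2), b_lo, c1);  // c2*a - c1*b
      *sub_hi = vmlsl_n_s16(vmull_n_s16(a_hi, c2), b_hi, c1);
    }

Using multiply-accumulate (vmlal/vmlsl) instead of separate vmull plus vadd/vsub can also drop an instruction per output.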
// fdct_round_shift
- t_lo[0] = vaddq_s32(s_lo[0], s_lo[8]);
- t_hi[0] = vaddq_s32(s_hi[0], s_hi[8]);
- t_lo[1] = vaddq_s32(s_lo[1], s_lo[9]);
- t_hi[1] = vaddq_s32(s_hi[1], s_hi[9]);
- t_lo[2] = vaddq_s32(s_lo[2], s_lo[10]);
- t_hi[2] = vaddq_s32(s_hi[2], s_hi[10]);
- t_lo[3] = vaddq_s32(s_lo[3], s_lo[11]);
- t_hi[3] = vaddq_s32(s_hi[3], s_hi[11]);
- t_lo[4] = vaddq_s32(s_lo[4], s_lo[12]);
- t_hi[4] = vaddq_s32(s_hi[4], s_hi[12]);
- t_lo[5] = vaddq_s32(s_lo[5], s_lo[13]);
- t_hi[5] = vaddq_s32(s_hi[5], s_hi[13]);
- t_lo[6] = vaddq_s32(s_lo[6], s_lo[14]);
- t_hi[6] = vaddq_s32(s_hi[6], s_hi[14]);
- t_lo[7] = vaddq_s32(s_lo[7], s_lo[15]);
- t_hi[7] = vaddq_s32(s_hi[7], s_hi[15]);
- t_lo[8] = vsubq_s32(s_lo[0], s_lo[8]);
- t_hi[8] = vsubq_s32(s_hi[0], s_hi[8]);
- t_lo[9] = vsubq_s32(s_lo[1], s_lo[9]);
- t_hi[9] = vsubq_s32(s_hi[1], s_hi[9]);
- t_lo[10] = vsubq_s32(s_lo[2], s_lo[10]);
- t_hi[10] = vsubq_s32(s_hi[2], s_hi[10]);
- t_lo[11] = vsubq_s32(s_lo[3], s_lo[11]);
- t_hi[11] = vsubq_s32(s_hi[3], s_hi[11]);
- t_lo[12] = vsubq_s32(s_lo[4], s_lo[12]);
- t_hi[12] = vsubq_s32(s_hi[4], s_hi[12]);
- t_lo[13] = vsubq_s32(s_lo[5], s_lo[13]);
- t_hi[13] = vsubq_s32(s_hi[5], s_hi[13]);
- t_lo[14] = vsubq_s32(s_lo[6], s_lo[14]);
- t_hi[14] = vsubq_s32(s_hi[6], s_hi[14]);
- t_lo[15] = vsubq_s32(s_lo[7], s_lo[15]);
- t_hi[15] = vsubq_s32(s_hi[7], s_hi[15]);
-
- t_lo[0] = vaddq_s32(t_lo[0], k__DCT_CONST_ROUNDING);
- t_hi[0] = vaddq_s32(t_hi[0], k__DCT_CONST_ROUNDING);
- t_lo[1] = vaddq_s32(t_lo[1], k__DCT_CONST_ROUNDING);
- t_hi[1] = vaddq_s32(t_hi[1], k__DCT_CONST_ROUNDING);
- t_lo[2] = vaddq_s32(t_lo[2], k__DCT_CONST_ROUNDING);
- t_hi[2] = vaddq_s32(t_hi[2], k__DCT_CONST_ROUNDING);
- t_lo[3] = vaddq_s32(t_lo[3], k__DCT_CONST_ROUNDING);
- t_hi[3] = vaddq_s32(t_hi[3], k__DCT_CONST_ROUNDING);
- t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING);
- t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING);
- t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING);
- t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING);
- t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING);
- t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING);
- t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING);
- t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING);
- t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING);
- t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING);
- t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING);
- t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING);
- t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING);
- t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING);
- t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING);
- t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING);
- t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
- t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
- t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
- t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
- t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
- t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
- t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
- t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
-
- t_lo[0] = vshrq_n_s32(t_lo[0], DCT_CONST_BITS);
- t_hi[0] = vshrq_n_s32(t_hi[0], DCT_CONST_BITS);
- t_lo[1] = vshrq_n_s32(t_lo[1], DCT_CONST_BITS);
- t_hi[1] = vshrq_n_s32(t_hi[1], DCT_CONST_BITS);
- t_lo[2] = vshrq_n_s32(t_lo[2], DCT_CONST_BITS);
- t_hi[2] = vshrq_n_s32(t_hi[2], DCT_CONST_BITS);
- t_lo[3] = vshrq_n_s32(t_lo[3], DCT_CONST_BITS);
- t_hi[3] = vshrq_n_s32(t_hi[3], DCT_CONST_BITS);
- t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS);
- t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS);
- t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS);
- t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS);
- t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS);
- t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS);
- t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS);
- t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS);
- t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS);
- t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS);
- t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS);
- t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS);
- t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS);
- t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS);
- t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS);
- t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
- t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
- t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
- t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
- t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
- t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
- t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
- t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+ t_lo[0] = vrshrq_n_s32(vaddq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+ t_hi[0] = vrshrq_n_s32(vaddq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+ t_lo[1] = vrshrq_n_s32(vaddq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+ t_hi[1] = vrshrq_n_s32(vaddq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+ t_lo[2] = vrshrq_n_s32(vaddq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+ t_hi[2] = vrshrq_n_s32(vaddq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+ t_lo[3] = vrshrq_n_s32(vaddq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+ t_hi[3] = vrshrq_n_s32(vaddq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+ t_lo[4] = vrshrq_n_s32(vaddq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(vaddq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(vaddq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(vaddq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(vaddq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(vaddq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(vaddq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(vaddq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
+ t_lo[8] = vrshrq_n_s32(vsubq_s32(s_lo[0], s_lo[8]), DCT_CONST_BITS);
+ t_hi[8] = vrshrq_n_s32(vsubq_s32(s_hi[0], s_hi[8]), DCT_CONST_BITS);
+ t_lo[9] = vrshrq_n_s32(vsubq_s32(s_lo[1], s_lo[9]), DCT_CONST_BITS);
+ t_hi[9] = vrshrq_n_s32(vsubq_s32(s_hi[1], s_hi[9]), DCT_CONST_BITS);
+ t_lo[10] = vrshrq_n_s32(vsubq_s32(s_lo[2], s_lo[10]), DCT_CONST_BITS);
+ t_hi[10] = vrshrq_n_s32(vsubq_s32(s_hi[2], s_hi[10]), DCT_CONST_BITS);
+ t_lo[11] = vrshrq_n_s32(vsubq_s32(s_lo[3], s_lo[11]), DCT_CONST_BITS);
+ t_hi[11] = vrshrq_n_s32(vsubq_s32(s_hi[3], s_hi[11]), DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(vsubq_s32(s_lo[4], s_lo[12]), DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(vsubq_s32(s_hi[4], s_hi[12]), DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(vsubq_s32(s_lo[5], s_lo[13]), DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(vsubq_s32(s_hi[5], s_hi[13]), DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(vsubq_s32(s_lo[6], s_lo[14]), DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(vsubq_s32(s_hi[6], s_hi[14]), DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(vsubq_s32(s_lo[7], s_lo[15]), DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(vsubq_s32(s_hi[7], s_hi[15]), DCT_CONST_BITS);
// stage 2
s_lo[0] = t_lo[0];
@@ -1055,45 +785,25 @@ static void fadst16_8col(int16x8_t *in) {
s_lo[7] = t_lo[7];
s_hi[7] = t_hi[7];
// s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s_lo[8] = vaddq_s32(vmulq_n_s32(t_lo[8], cospi_4_64),
- vmulq_n_s32(t_lo[9], cospi_28_64));
- s_hi[8] = vaddq_s32(vmulq_n_s32(t_hi[8], cospi_4_64),
- vmulq_n_s32(t_hi[9], cospi_28_64));
// s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s_lo[9] = vsubq_s32(vmulq_n_s32(t_lo[8], cospi_28_64),
- vmulq_n_s32(t_lo[9], cospi_4_64));
- s_hi[9] = vsubq_s32(vmulq_n_s32(t_hi[8], cospi_28_64),
- vmulq_n_s32(t_hi[9], cospi_4_64));
+ butterfly_two_coeff_s32_noround(t_lo[8], t_hi[8], t_lo[9], t_hi[9],
+ cospi_4_64, cospi_28_64, &s_lo[8], &s_hi[8],
+ &s_lo[9], &s_hi[9]);
// s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s_lo[10] = vaddq_s32(vmulq_n_s32(t_lo[10], cospi_20_64),
- vmulq_n_s32(t_lo[11], cospi_12_64));
- s_hi[10] = vaddq_s32(vmulq_n_s32(t_hi[10], cospi_20_64),
- vmulq_n_s32(t_hi[11], cospi_12_64));
// s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s_lo[11] = vsubq_s32(vmulq_n_s32(t_lo[10], cospi_12_64),
- vmulq_n_s32(t_lo[11], cospi_20_64));
- s_hi[11] = vsubq_s32(vmulq_n_s32(t_hi[10], cospi_12_64),
- vmulq_n_s32(t_hi[11], cospi_20_64));
+ butterfly_two_coeff_s32_noround(t_lo[10], t_hi[10], t_lo[11], t_hi[11],
+ cospi_20_64, cospi_12_64, &s_lo[10],
+ &s_hi[10], &s_lo[11], &s_hi[11]);
// s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], -cospi_28_64),
- vmulq_n_s32(t_lo[13], cospi_4_64));
- s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], -cospi_28_64),
- vmulq_n_s32(t_hi[13], cospi_4_64));
// s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_4_64),
- vmulq_n_s32(t_lo[13], cospi_28_64));
- s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_4_64),
- vmulq_n_s32(t_hi[13], cospi_28_64));
+ butterfly_two_coeff_s32_noround(t_lo[13], t_hi[13], t_lo[12], t_hi[12],
+ cospi_28_64, cospi_4_64, &s_lo[13], &s_hi[13],
+ &s_lo[12], &s_hi[12]);
// s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_12_64),
- vmulq_n_s32(t_lo[15], cospi_20_64));
- s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_12_64),
- vmulq_n_s32(t_hi[15], cospi_20_64));
// s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
- s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_20_64),
- vmulq_n_s32(t_lo[15], cospi_12_64));
- s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_20_64),
- vmulq_n_s32(t_hi[15], cospi_12_64));
+ butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ cospi_12_64, cospi_20_64, &s_lo[15],
+ &s_hi[15], &s_lo[14], &s_hi[14]);
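Stage 2 uses a 32-bit variant of the same butterfly. The negated-coefficient cases (s12 and s14) need no extra form: the call swaps both the operands and the output pointers, so the helper's subtract output c2 * a - c1 * b lands in the slot that wants -x * c1 + y * c2. An assumed sketch of the s32 helper (the real definition is in vpx_dsp/arm/fdct_neon.h):

    #include <arm_neon.h>

    static inline void butterfly_two_coeff_s32_noround(
        const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
        const int32x4_t b_hi, const int32_t c1, const int32_t c2,
        int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
        int32x4_t *sub_hi) {
      *add_lo = vmlaq_n_s32(vmulq_n_s32(a_lo, c1), b_lo, c2);  // c1*a + c2*b
      *add_hi = vmlaq_n_s32(vmulq_n_s32(a_hi, c1), b_hi, c2);
      *sub_lo = vmlsq_n_s32(vmulq_n_s32(a_lo, c2), b_lo, c1);  // c2*a - c1*b
      *sub_hi = vmlsq_n_s32(vmulq_n_s32(a_hi, c2), b_hi, c1);
    }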
// s0 + s4
t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]);
@@ -1144,38 +854,22 @@ static void fadst16_8col(int16x8_t *in) {
t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]);
t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]);
- t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING);
- t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING);
- t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING);
- t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING);
- t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING);
- t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING);
- t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING);
- t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING);
- t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
- t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
- t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
- t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
- t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
- t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
- t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
- t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
- t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS);
- t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS);
- t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS);
- t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS);
- t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS);
- t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS);
- t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS);
- t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS);
- t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
- t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
- t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
- t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
- t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
- t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
- t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
- t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+ t_lo[8] = vrshrq_n_s32(t_lo[8], DCT_CONST_BITS);
+ t_hi[8] = vrshrq_n_s32(t_hi[8], DCT_CONST_BITS);
+ t_lo[9] = vrshrq_n_s32(t_lo[9], DCT_CONST_BITS);
+ t_hi[9] = vrshrq_n_s32(t_hi[9], DCT_CONST_BITS);
+ t_lo[10] = vrshrq_n_s32(t_lo[10], DCT_CONST_BITS);
+ t_hi[10] = vrshrq_n_s32(t_hi[10], DCT_CONST_BITS);
+ t_lo[11] = vrshrq_n_s32(t_lo[11], DCT_CONST_BITS);
+ t_hi[11] = vrshrq_n_s32(t_hi[11], DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
// stage 3
s_lo[0] = t_lo[0];
@@ -1187,25 +881,15 @@ static void fadst16_8col(int16x8_t *in) {
s_lo[3] = t_lo[3];
s_hi[3] = t_hi[3];
// s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s_lo[4] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_8_64),
- vmulq_n_s32(t_lo[5], cospi_24_64));
- s_hi[4] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_8_64),
- vmulq_n_s32(t_hi[5], cospi_24_64));
// s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s_lo[5] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_24_64),
- vmulq_n_s32(t_lo[5], -cospi_8_64));
- s_hi[5] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_24_64),
- vmulq_n_s32(t_hi[5], -cospi_8_64));
+ butterfly_two_coeff_s32_noround(t_lo[4], t_hi[4], t_lo[5], t_hi[5],
+ cospi_8_64, cospi_24_64, &s_lo[4], &s_hi[4],
+ &s_lo[5], &s_hi[5]);
// s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s_lo[6] = vaddq_s32(vmulq_n_s32(t_lo[6], -cospi_24_64),
- vmulq_n_s32(t_lo[7], cospi_8_64));
- s_hi[6] = vaddq_s32(vmulq_n_s32(t_hi[6], -cospi_24_64),
- vmulq_n_s32(t_hi[7], cospi_8_64));
// s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s_lo[7] = vaddq_s32(vmulq_n_s32(t_lo[6], cospi_8_64),
- vmulq_n_s32(t_lo[7], cospi_24_64));
- s_hi[7] = vaddq_s32(vmulq_n_s32(t_hi[6], cospi_8_64),
- vmulq_n_s32(t_hi[7], cospi_24_64));
+ butterfly_two_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+ cospi_24_64, cospi_8_64, &s_lo[7], &s_hi[7],
+ &s_lo[6], &s_hi[6]);
s_lo[8] = t_lo[8];
s_hi[8] = t_hi[8];
s_lo[9] = t_lo[9];
@@ -1215,25 +899,15 @@ static void fadst16_8col(int16x8_t *in) {
s_lo[11] = t_lo[11];
s_hi[11] = t_hi[11];
// s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_8_64),
- vmulq_n_s32(t_lo[13], cospi_24_64));
- s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_8_64),
- vmulq_n_s32(t_hi[13], cospi_24_64));
// s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_24_64),
- vmulq_n_s32(t_lo[13], -cospi_8_64));
- s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_24_64),
- vmulq_n_s32(t_hi[13], -cospi_8_64));
+ butterfly_two_coeff_s32_noround(t_lo[12], t_hi[12], t_lo[13], t_hi[13],
+ cospi_8_64, cospi_24_64, &s_lo[12], &s_hi[12],
+ &s_lo[13], &s_hi[13]);
// s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_24_64),
- vmulq_n_s32(t_lo[15], cospi_8_64));
- s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_24_64),
- vmulq_n_s32(t_hi[15], cospi_8_64));
// s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
- s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_8_64),
- vmulq_n_s32(t_lo[15], cospi_24_64));
- s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_8_64),
- vmulq_n_s32(t_hi[15], cospi_24_64));
+ butterfly_two_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ cospi_24_64, cospi_8_64, &s_lo[15], &s_hi[15],
+ &s_lo[14], &s_hi[14]);
// s0 + s4
t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
@@ -1284,99 +958,62 @@ static void fadst16_8col(int16x8_t *in) {
t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]);
t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]);
- t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING);
- t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING);
- t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING);
- t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING);
- t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING);
- t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING);
- t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING);
- t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING);
- t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
- t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
- t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
- t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
- t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
- t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
- t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
- t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
- t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS);
- t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS);
- t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS);
- t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS);
- t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS);
- t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS);
- t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS);
- t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS);
- t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
- t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
- t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
- t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
- t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
- t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
- t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
- t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+ t_lo[4] = vrshrq_n_s32(t_lo[4], DCT_CONST_BITS);
+ t_hi[4] = vrshrq_n_s32(t_hi[4], DCT_CONST_BITS);
+ t_lo[5] = vrshrq_n_s32(t_lo[5], DCT_CONST_BITS);
+ t_hi[5] = vrshrq_n_s32(t_hi[5], DCT_CONST_BITS);
+ t_lo[6] = vrshrq_n_s32(t_lo[6], DCT_CONST_BITS);
+ t_hi[6] = vrshrq_n_s32(t_hi[6], DCT_CONST_BITS);
+ t_lo[7] = vrshrq_n_s32(t_lo[7], DCT_CONST_BITS);
+ t_hi[7] = vrshrq_n_s32(t_hi[7], DCT_CONST_BITS);
+ t_lo[12] = vrshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+ t_hi[12] = vrshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+ t_lo[13] = vrshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+ t_hi[13] = vrshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+ t_lo[14] = vrshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+ t_hi[14] = vrshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+ t_lo[15] = vrshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+ t_hi[15] = vrshrq_n_s32(t_hi[15], DCT_CONST_BITS);
// stage 4
// s2 = (-cospi_16_64) * (x2 + x3);
- s_lo[2] = vmulq_n_s32(vaddq_s32(t_lo[2], t_lo[3]), -cospi_16_64);
- s_hi[2] = vmulq_n_s32(vaddq_s32(t_hi[2], t_hi[3]), -cospi_16_64);
// s3 = cospi_16_64 * (x2 - x3);
- s_lo[3] = vmulq_n_s32(vsubq_s32(t_lo[2], t_lo[3]), cospi_16_64);
- s_hi[3] = vmulq_n_s32(vsubq_s32(t_hi[2], t_hi[3]), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[3], t_hi[3], t_lo[2], t_hi[2],
+ -cospi_16_64, &s_lo[2], &s_hi[2], &s_lo[3],
+ &s_hi[3]);
// s6 = cospi_16_64 * (x6 + x7);
- s_lo[6] = vmulq_n_s32(vaddq_s32(t_lo[6], t_lo[7]), cospi_16_64);
- s_hi[6] = vmulq_n_s32(vaddq_s32(t_hi[6], t_hi[7]), cospi_16_64);
// s7 = cospi_16_64 * (-x6 + x7);
- s_lo[7] = vmulq_n_s32(vsubq_s32(t_lo[7], t_lo[6]), cospi_16_64);
- s_hi[7] = vmulq_n_s32(vsubq_s32(t_hi[7], t_hi[6]), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[7], t_hi[7], t_lo[6], t_hi[6],
+ cospi_16_64, &s_lo[6], &s_hi[6], &s_lo[7],
+ &s_hi[7]);
// s10 = cospi_16_64 * (x10 + x11);
- s_lo[10] = vmulq_n_s32(vaddq_s32(t_lo[10], t_lo[11]), cospi_16_64);
- s_hi[10] = vmulq_n_s32(vaddq_s32(t_hi[10], t_hi[11]), cospi_16_64);
// s11 = cospi_16_64 * (-x10 + x11);
- s_lo[11] = vmulq_n_s32(vsubq_s32(t_lo[11], t_lo[10]), cospi_16_64);
- s_hi[11] = vmulq_n_s32(vsubq_s32(t_hi[11], t_hi[10]), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[11], t_hi[11], t_lo[10], t_hi[10],
+ cospi_16_64, &s_lo[10], &s_hi[10], &s_lo[11],
+ &s_hi[11]);
// s14 = (-cospi_16_64) * (x14 + x15);
- s_lo[14] = vmulq_n_s32(vaddq_s32(t_lo[14], t_lo[15]), -cospi_16_64);
- s_hi[14] = vmulq_n_s32(vaddq_s32(t_hi[14], t_hi[15]), -cospi_16_64);
// s15 = cospi_16_64 * (x14 - x15);
- s_lo[15] = vmulq_n_s32(vsubq_s32(t_lo[14], t_lo[15]), cospi_16_64);
- s_hi[15] = vmulq_n_s32(vsubq_s32(t_hi[14], t_hi[15]), cospi_16_64);
+ butterfly_one_coeff_s32_noround(t_lo[15], t_hi[15], t_lo[14], t_hi[14],
+ -cospi_16_64, &s_lo[14], &s_hi[14], &s_lo[15],
+ &s_hi[15]);
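Stage 4 multiplies only a sum and a difference by one cospi constant, so a one-coefficient butterfly is enough; passing -cospi_16_64 flips the sign of both outputs, which is exactly what the s2/s3 and s14/s15 formulas require. Assumed shape:

    #include <arm_neon.h>

    // add = c * (a + b), sub = c * (a - b); no rounding applied here.
    static inline void butterfly_one_coeff_s32_noround(
        const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
        const int32x4_t b_hi, const int32_t c, int32x4_t *add_lo,
        int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
      *add_lo = vmulq_n_s32(vaddq_s32(a_lo, b_lo), c);
      *add_hi = vmulq_n_s32(vaddq_s32(a_hi, b_hi), c);
      *sub_lo = vmulq_n_s32(vsubq_s32(a_lo, b_lo), c);
      *sub_hi = vmulq_n_s32(vsubq_s32(a_hi, b_hi), c);
    }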
// final fdct_round_shift
- t_lo[2] = vaddq_s32(s_lo[2], k__DCT_CONST_ROUNDING);
- t_hi[2] = vaddq_s32(s_hi[2], k__DCT_CONST_ROUNDING);
- t_lo[3] = vaddq_s32(s_lo[3], k__DCT_CONST_ROUNDING);
- t_hi[3] = vaddq_s32(s_hi[3], k__DCT_CONST_ROUNDING);
- t_lo[6] = vaddq_s32(s_lo[6], k__DCT_CONST_ROUNDING);
- t_hi[6] = vaddq_s32(s_hi[6], k__DCT_CONST_ROUNDING);
- t_lo[7] = vaddq_s32(s_lo[7], k__DCT_CONST_ROUNDING);
- t_hi[7] = vaddq_s32(s_hi[7], k__DCT_CONST_ROUNDING);
- t_lo[10] = vaddq_s32(s_lo[10], k__DCT_CONST_ROUNDING);
- t_hi[10] = vaddq_s32(s_hi[10], k__DCT_CONST_ROUNDING);
- t_lo[11] = vaddq_s32(s_lo[11], k__DCT_CONST_ROUNDING);
- t_hi[11] = vaddq_s32(s_hi[11], k__DCT_CONST_ROUNDING);
- t_lo[14] = vaddq_s32(s_lo[14], k__DCT_CONST_ROUNDING);
- t_hi[14] = vaddq_s32(s_hi[14], k__DCT_CONST_ROUNDING);
- t_lo[15] = vaddq_s32(s_lo[15], k__DCT_CONST_ROUNDING);
- t_hi[15] = vaddq_s32(s_hi[15], k__DCT_CONST_ROUNDING);
-
- x_lo[2] = vshrn_n_s32(t_lo[2], DCT_CONST_BITS);
- x_hi[2] = vshrn_n_s32(t_hi[2], DCT_CONST_BITS);
- x_lo[3] = vshrn_n_s32(t_lo[3], DCT_CONST_BITS);
- x_hi[3] = vshrn_n_s32(t_hi[3], DCT_CONST_BITS);
- x_lo[6] = vshrn_n_s32(t_lo[6], DCT_CONST_BITS);
- x_hi[6] = vshrn_n_s32(t_hi[6], DCT_CONST_BITS);
- x_lo[7] = vshrn_n_s32(t_lo[7], DCT_CONST_BITS);
- x_hi[7] = vshrn_n_s32(t_hi[7], DCT_CONST_BITS);
- x_lo[10] = vshrn_n_s32(t_lo[10], DCT_CONST_BITS);
- x_hi[10] = vshrn_n_s32(t_hi[10], DCT_CONST_BITS);
- x_lo[11] = vshrn_n_s32(t_lo[11], DCT_CONST_BITS);
- x_hi[11] = vshrn_n_s32(t_hi[11], DCT_CONST_BITS);
- x_lo[14] = vshrn_n_s32(t_lo[14], DCT_CONST_BITS);
- x_hi[14] = vshrn_n_s32(t_hi[14], DCT_CONST_BITS);
- x_lo[15] = vshrn_n_s32(t_lo[15], DCT_CONST_BITS);
- x_hi[15] = vshrn_n_s32(t_hi[15], DCT_CONST_BITS);
+ x_lo[2] = vrshrn_n_s32(s_lo[2], DCT_CONST_BITS);
+ x_hi[2] = vrshrn_n_s32(s_hi[2], DCT_CONST_BITS);
+ x_lo[3] = vrshrn_n_s32(s_lo[3], DCT_CONST_BITS);
+ x_hi[3] = vrshrn_n_s32(s_hi[3], DCT_CONST_BITS);
+ x_lo[6] = vrshrn_n_s32(s_lo[6], DCT_CONST_BITS);
+ x_hi[6] = vrshrn_n_s32(s_hi[6], DCT_CONST_BITS);
+ x_lo[7] = vrshrn_n_s32(s_lo[7], DCT_CONST_BITS);
+ x_hi[7] = vrshrn_n_s32(s_hi[7], DCT_CONST_BITS);
+ x_lo[10] = vrshrn_n_s32(s_lo[10], DCT_CONST_BITS);
+ x_hi[10] = vrshrn_n_s32(s_hi[10], DCT_CONST_BITS);
+ x_lo[11] = vrshrn_n_s32(s_lo[11], DCT_CONST_BITS);
+ x_hi[11] = vrshrn_n_s32(s_hi[11], DCT_CONST_BITS);
+ x_lo[14] = vrshrn_n_s32(s_lo[14], DCT_CONST_BITS);
+ x_hi[14] = vrshrn_n_s32(s_hi[14], DCT_CONST_BITS);
+ x_lo[15] = vrshrn_n_s32(s_lo[15], DCT_CONST_BITS);
+ x_hi[15] = vrshrn_n_s32(s_hi[15], DCT_CONST_BITS);
// x0, x1, x4, x5, x8, x9, x12, x13 narrow down to 16 bits directly
x_lo[0] = vmovn_s32(t_lo[0]);
@@ -1458,3 +1095,137 @@ void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride,
break;
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_load_buffer_4x4(const int16_t *input,
+ int32x4_t *in /*[4]*/, int stride) {
+ // { 0, 1, 1, 1 };
+ const int32x4_t nonzero_bias_a = vextq_s32(vdupq_n_s32(0), vdupq_n_s32(1), 3);
+ // { 1, 0, 0, 0 };
+ const int32x4_t nonzero_bias_b = vextq_s32(vdupq_n_s32(1), vdupq_n_s32(0), 3);
+ int32x4_t mask;
+
+ in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // Copy the SSE method: use a mask instead of an 'if' branch to add one
+ // to the first element when it is non-zero.
+ mask = vreinterpretq_s32_u32(vceqq_s32(in[0], nonzero_bias_a));
+ in[0] = vaddq_s32(in[0], mask);
+ in[0] = vaddq_s32(in[0], nonzero_bias_b);
+}
+
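The two vaddq_s32 lines above implement the reference encoder's conditional first-element tweak without a branch: after the << 4 upshift every lane is a multiple of 16, so comparing against the {0,1,1,1} vector can only fire in lane 0 (a lane equal to 1 is impossible), and the net effect there is "add one when non-zero". A scalar model, for illustration only:

    // Scalar model of the branchless bias (illustrative; v is input[0] << 4).
    static int32_t bias_first_element(int32_t v) {
      const int32_t mask = (v == 0) ? -1 : 0;  // vceqq_s32 against lane value 0
      return v + mask + 1;                     // v == 0 -> v;  v != 0 -> v + 1
    }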
+static INLINE void highbd_write_buffer_4x4(tran_low_t *output, int32x4_t *res) {
+ const int32x4_t one = vdupq_n_s32(1);
+ res[0] = vshrq_n_s32(vaddq_s32(res[0], one), 2);
+ res[1] = vshrq_n_s32(vaddq_s32(res[1], one), 2);
+ res[2] = vshrq_n_s32(vaddq_s32(res[2], one), 2);
+ res[3] = vshrq_n_s32(vaddq_s32(res[3], one), 2);
+ vst1q_s32(output + 0 * 4, res[0]);
+ vst1q_s32(output + 1 * 4, res[1]);
+ vst1q_s32(output + 2 * 4, res[2]);
+ vst1q_s32(output + 3 * 4, res[3]);
+}
+
+static INLINE void highbd_fadst4x4_neon(int32x4_t *in /*[4]*/) {
+ int32x2_t s_lo[4], s_hi[4];
+ int64x2_t u_lo[4], u_hi[4], t_lo[4], t_hi[4];
+
+ s_lo[0] = vget_low_s32(in[0]);
+ s_hi[0] = vget_high_s32(in[0]);
+ s_lo[1] = vget_low_s32(in[1]);
+ s_hi[1] = vget_high_s32(in[1]);
+ s_lo[2] = vget_low_s32(in[2]);
+ s_hi[2] = vget_high_s32(in[2]);
+ s_lo[3] = vget_low_s32(in[3]);
+ s_hi[3] = vget_high_s32(in[3]);
+
+ // t0 = s0 * sinpi_1_9 + s1 * sinpi_2_9 + s3 * sinpi_4_9
+ t_lo[0] = vmull_n_s32(s_lo[0], sinpi_1_9);
+ t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[1], sinpi_2_9);
+ t_lo[0] = vmlal_n_s32(t_lo[0], s_lo[3], sinpi_4_9);
+ t_hi[0] = vmull_n_s32(s_hi[0], sinpi_1_9);
+ t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[1], sinpi_2_9);
+ t_hi[0] = vmlal_n_s32(t_hi[0], s_hi[3], sinpi_4_9);
+
+ // t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+ t_lo[1] = vmull_n_s32(s_lo[0], sinpi_3_9);
+ t_lo[1] = vmlal_n_s32(t_lo[1], s_lo[1], sinpi_3_9);
+ t_lo[1] = vmlsl_n_s32(t_lo[1], s_lo[3], sinpi_3_9);
+ t_hi[1] = vmull_n_s32(s_hi[0], sinpi_3_9);
+ t_hi[1] = vmlal_n_s32(t_hi[1], s_hi[1], sinpi_3_9);
+ t_hi[1] = vmlsl_n_s32(t_hi[1], s_hi[3], sinpi_3_9);
+
+ // t2 = s0 * sinpi_4_9 - s1 * sinpi_1_9 + s3 * sinpi_2_9
+ t_lo[2] = vmull_n_s32(s_lo[0], sinpi_4_9);
+ t_lo[2] = vmlsl_n_s32(t_lo[2], s_lo[1], sinpi_1_9);
+ t_lo[2] = vmlal_n_s32(t_lo[2], s_lo[3], sinpi_2_9);
+ t_hi[2] = vmull_n_s32(s_hi[0], sinpi_4_9);
+ t_hi[2] = vmlsl_n_s32(t_hi[2], s_hi[1], sinpi_1_9);
+ t_hi[2] = vmlal_n_s32(t_hi[2], s_hi[3], sinpi_2_9);
+
+ // t3 = s2 * sinpi_3_9
+ t_lo[3] = vmull_n_s32(s_lo[2], sinpi_3_9);
+ t_hi[3] = vmull_n_s32(s_hi[2], sinpi_3_9);
+
+ /*
+ * u0 = t0 + t3
+ * u1 = t1
+ * u2 = t2 - t3
+ * u3 = t2 - t0 + t3
+ */
+ u_lo[0] = vaddq_s64(t_lo[0], t_lo[3]);
+ u_hi[0] = vaddq_s64(t_hi[0], t_hi[3]);
+ u_lo[1] = t_lo[1];
+ u_hi[1] = t_hi[1];
+ u_lo[2] = vsubq_s64(t_lo[2], t_lo[3]);
+ u_hi[2] = vsubq_s64(t_hi[2], t_hi[3]);
+ u_lo[3] = vaddq_s64(vsubq_s64(t_lo[2], t_lo[0]), t_lo[3]);
+ u_hi[3] = vaddq_s64(vsubq_s64(t_hi[2], t_hi[0]), t_hi[3]);
+
+ // fdct_round_shift
+ in[0] = vcombine_s32(vrshrn_n_s64(u_lo[0], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[0], DCT_CONST_BITS));
+ in[1] = vcombine_s32(vrshrn_n_s64(u_lo[1], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[1], DCT_CONST_BITS));
+ in[2] = vcombine_s32(vrshrn_n_s64(u_lo[2], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[2], DCT_CONST_BITS));
+ in[3] = vcombine_s32(vrshrn_n_s64(u_lo[3], DCT_CONST_BITS),
+ vrshrn_n_s64(u_hi[3], DCT_CONST_BITS));
+
+ transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+}
+
+void vp9_highbd_fht4x4_neon(const int16_t *input, tran_low_t *output,
+ int stride, int tx_type) {
+ int32x4_t in[4];
+
+ switch (tx_type) {
+ case DCT_DCT: vpx_highbd_fdct4x4_neon(input, output, stride); break;
+ case ADST_DCT:
+ highbd_load_buffer_4x4(input, in, stride);
+ highbd_fadst4x4_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ case DCT_ADST:
+ highbd_load_buffer_4x4(input, in, stride);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ highbd_fadst4x4_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ default:
+ assert(tx_type == ADST_ADST);
+ highbd_load_buffer_4x4(input, in, stride);
+ highbd_fadst4x4_neon(in);
+ highbd_fadst4x4_neon(in);
+ highbd_write_buffer_4x4(output, in);
+ break;
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
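For orientation: each hybrid transform applies a row pass and a column pass, and highbd_fadst4x4_neon finishes with transpose_s32_4x4, so invoking it twice (the ADST_ADST branch) yields ADST in both dimensions. A hypothetical caller, purely illustrative (header path and buffer shapes assumed):

    #include "./vp9_rtcd.h"  // vp9_highbd_fht4x4_neon; ADST_ADST from vp9_enums.h

    static void example_fht4x4(const int16_t residual[4 * 4],
                               tran_low_t coeff[4 * 4]) {
      // Stride is in int16 elements; 4 for a densely packed 4x4 block.
      vp9_highbd_fht4x4_neon(residual, coeff, 4, ADST_ADST);
    }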
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
new file mode 100644
index 000000000..33753f77b
--- /dev/null
+++ b/libvpx/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <arm_neon.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vp9/encoder/vp9_encoder.h"
+#include "vpx_ports/mem.h"
+
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+static INLINE int_mv pack_int_mv(int16_t row, int16_t col) {
+ int_mv result;
+ result.as_mv.row = row;
+ result.as_mv.col = col;
+ return result;
+}
+
+static INLINE MV_JOINT_TYPE get_mv_joint(const int_mv mv) {
+ // This is simplified from the C implementation to utilise the fact that
+ // x->nmvjointsadcost[1] == x->nmvjointsadcost[2] and
+ // x->nmvjointsadcost[1] == x->nmvjointsadcost[3]
+ return mv.as_int == 0 ? 0 : 1;
+}
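The unsimplified classifier distinguishes four joint types, and the packed 32-bit compare tests row and column at once. For comparison, a sketch following vp9_get_mv_joint() in vp9/common/vp9_mv.h (enum values MV_JOINT_ZERO, MV_JOINT_HNZVZ, MV_JOINT_HZVNZ, MV_JOINT_HVNZ are 0..3):

    #include "vp9/common/vp9_mv.h"  // MV, MV_JOINT_TYPE

    static MV_JOINT_TYPE get_mv_joint_full(const MV *mv) {
      if (mv->row == 0) {
        return mv->col == 0 ? MV_JOINT_ZERO : MV_JOINT_HNZVZ;
      }
      return mv->col == 0 ? MV_JOINT_HZVNZ : MV_JOINT_HVNZ;
    }

With nmvjointsadcost[1], [2] and [3] equal, indexes 1..3 are interchangeable and the 0/1 shortcut above suffices.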
+
+static INLINE int mv_cost(const int_mv mv, const int *joint_cost,
+ int *const comp_cost[2]) {
+ assert(mv.as_mv.row >= -MV_MAX && mv.as_mv.row < MV_MAX);
+ assert(mv.as_mv.col >= -MV_MAX && mv.as_mv.col < MV_MAX);
+ return joint_cost[get_mv_joint(mv)] + comp_cost[0][mv.as_mv.row] +
+ comp_cost[1][mv.as_mv.col];
+}
+
+static int mvsad_err_cost(const MACROBLOCK *x, const int_mv mv, const MV *ref,
+ int sad_per_bit) {
+ const int_mv diff =
+ pack_int_mv(mv.as_mv.row - ref->row, mv.as_mv.col - ref->col);
+ return ROUND_POWER_OF_TWO(
+ (unsigned)mv_cost(diff, x->nmvjointsadcost, x->nmvsadcost) * sad_per_bit,
+ VP9_PROB_COST_SHIFT);
+}
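The cost is scaled by sad_per_bit and then reduced with a rounded shift; ROUND_POWER_OF_TWO is libvpx's rounding division by a power of two. For reference, its conventional definition in vpx_dsp/vpx_dsp_common.h:

    // Rounded right shift: add half of the divisor before shifting.
    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))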
+
+/*****************************************************************************
+ * This function utilizes 3 properties of the cost function lookup tables, *
+ * constructed using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in    *
+ * vp9_encoder.c. *
+ * For the joint cost: *
+ * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] *
+ * For the component costs: *
+ * - For all i: mvsadcost[0][i] == mvsadcost[1][i] *
+ * (Equal costs for both components) *
+ * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] *
+ * (Cost function is even) *
+ * If these do not hold, then this function cannot be used without *
+ * modification, in which case you can revert to using the C implementation, *
+ * which does not rely on these properties. *
+ *****************************************************************************/
+int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
+ const search_site_config *cfg, MV *ref_mv,
+ MV *best_mv, int search_param, int sad_per_bit,
+ int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
+ const MV *center_mv) {
+ static const uint32_t data[4] = { 0, 1, 2, 3 };
+ const uint32x4_t v_idx_d = vld1q_u32((const uint32_t *)data);
+
+ const int32x4_t zero_s32 = vdupq_n_s32(0);
+ const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
+ const int16x8_t v_max_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(maxmv.as_int));
+ const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
+ const int16x8_t v_min_mv_w = vreinterpretq_s16_s32(vdupq_n_s32(minmv.as_int));
+
+ const int32x4_t v_spb_d = vdupq_n_s32(sad_per_bit);
+
+ const int32x4_t v_joint_cost_0_d = vdupq_n_s32(x->nmvjointsadcost[0]);
+ const int32x4_t v_joint_cost_1_d = vdupq_n_s32(x->nmvjointsadcost[1]);
+
+ // search_param determines the length of the initial step and hence the number
+ // of iterations.
+ // 0 = initial step (MAX_FIRST_STEP) pel
+ // 1 = (MAX_FIRST_STEP/2) pel,
+ // 2 = (MAX_FIRST_STEP/4) pel...
+ const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
+ const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
+ const int tot_steps = cfg->total_steps - search_param;
+
+ const int_mv fcenter_mv =
+ pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
+ const int16x8_t vfcmv = vreinterpretq_s16_s32(vdupq_n_s32(fcenter_mv.as_int));
+
+ const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
+ const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
+
+ int_mv bmv = pack_int_mv(ref_row, ref_col);
+ int_mv new_bmv = bmv;
+ int16x8_t v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+
+ const int what_stride = x->plane[0].src.stride;
+ const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
+ const uint8_t *const what = x->plane[0].src.buf;
+ const uint8_t *const in_what =
+ x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col;
+
+ // Work out the start point for the search
+ const uint8_t *best_address = in_what;
+ const uint8_t *new_best_address = best_address;
+#if defined(__aarch64__)
+ int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+ int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+ unsigned int best_sad = INT_MAX;
+ int i, j, step;
+
+ // Check the prerequisite cost function properties that are easy to check
+ // in an assert. See the function-level documentation for details on all
+ // prerequisites.
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
+ assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);
+
+ // Check the starting position
+ best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
+ best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);
+
+ *num00 = 0;
+
+ for (i = 0, step = 0; step < tot_steps; step++) {
+ for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
+ int16x8_t v_diff_mv_w;
+ int8x16_t v_inside_d;
+ uint32x4_t v_outside_d;
+ int32x4_t v_cost_d, v_sad_d;
+#if defined(__aarch64__)
+ int64x2_t v_blocka[2];
+#else
+ int32x4_t v_blocka[1];
+ uint32x2_t horiz_max_0, horiz_max_1;
+#endif
+
+ uint32_t horiz_max;
+ // Compute the candidate motion vectors
+ const int16x8_t v_ss_mv_w = vld1q_s16((const int16_t *)&ss_mv[i]);
+ const int16x8_t v_these_mv_w = vaddq_s16(v_bmv_w, v_ss_mv_w);
+ // Clamp them to the search bounds
+ int16x8_t v_these_mv_clamp_w = v_these_mv_w;
+ v_these_mv_clamp_w = vminq_s16(v_these_mv_clamp_w, v_max_mv_w);
+ v_these_mv_clamp_w = vmaxq_s16(v_these_mv_clamp_w, v_min_mv_w);
+ // The ones that did not change are inside the search area
+ v_inside_d = vreinterpretq_s8_u32(
+ vceqq_s32(vreinterpretq_s32_s16(v_these_mv_clamp_w),
+ vreinterpretq_s32_s16(v_these_mv_w)));
+
+ // If none of them are inside, then move on
+#if defined(__aarch64__)
+ horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
+#else
+ horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
+ vget_high_u32(vreinterpretq_u32_s8(v_inside_d)));
+ horiz_max_1 = vpmax_u32(horiz_max_0, horiz_max_0);
+ vst1_lane_u32(&horiz_max, horiz_max_1, 0);
+#endif
+ if (LIKELY(horiz_max == 0)) {
+ continue;
+ }
+
+ // The inverse mask indicates which of the MVs are outside
+ v_outside_d =
+ vreinterpretq_u32_s8(veorq_s8(v_inside_d, vdupq_n_s8((int8_t)0xff)));
+ // Shift right to keep the sign bit clear, we will use this later
+ // to set the cost to the maximum value.
+ v_outside_d = vshrq_n_u32(v_outside_d, 1);
+
+ // Compute the difference MV
+ v_diff_mv_w = vsubq_s16(v_these_mv_clamp_w, vfcmv);
+ // We utilise the fact that the cost function is even, and use the
+ // absolute difference. This allows us to use unsigned indexes later
+ // and reduces cache pressure somewhat as only half of the table
+ // is ever referenced.
+ v_diff_mv_w = vabsq_s16(v_diff_mv_w);
+
+ // Compute the SIMD pointer offsets.
+ {
+#if defined(__aarch64__) // sizeof(intptr_t) == 8
+ // Load the offsets
+ int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
+ int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
+ // Set the ones falling outside to zero
+ v_bo10_q = vandq_s64(
+ v_bo10_q,
+ vmovl_s32(vget_low_s32(vreinterpretq_s32_s8(v_inside_d))));
+ v_bo32_q = vandq_s64(
+ v_bo32_q,
+ vmovl_s32(vget_high_s32(vreinterpretq_s32_s8(v_inside_d))));
+ // Compute the candidate addresses
+ v_blocka[0] = vaddq_s64(v_ba_q, v_bo10_q);
+ v_blocka[1] = vaddq_s64(v_ba_q, v_bo32_q);
+#else // sizeof(intptr_t) == 4
+ int32x4_t v_bo_d = vld1q_s32((const int32_t *)&ss_os[i]);
+ v_bo_d = vandq_s32(v_bo_d, vreinterpretq_s32_s8(v_inside_d));
+ v_blocka[0] = vaddq_s32(v_ba_d, v_bo_d);
+#endif
+ }
+
+ fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0],
+ in_what_stride, (uint32_t *)&v_sad_d);
+
+ // Look up the component cost of the residual motion vector
+ {
+ uint32_t cost[4];
+ int16_t __attribute__((aligned(16))) rowcol[8];
+ vst1q_s16(rowcol, v_diff_mv_w);
+
+ // Note: this is a use case for a gather instruction (which Neon lacks).
+ cost[0] = x->nmvsadcost[0][rowcol[0]] + x->nmvsadcost[0][rowcol[1]];
+ cost[1] = x->nmvsadcost[0][rowcol[2]] + x->nmvsadcost[0][rowcol[3]];
+ cost[2] = x->nmvsadcost[0][rowcol[4]] + x->nmvsadcost[0][rowcol[5]];
+ cost[3] = x->nmvsadcost[0][rowcol[6]] + x->nmvsadcost[0][rowcol[7]];
+
+ v_cost_d = vld1q_s32((int32_t *)cost);
+ }
+
+ // Now add in the joint cost
+ {
+ const uint32x4_t v_sel_d =
+ vceqq_s32(vreinterpretq_s32_s16(v_diff_mv_w), zero_s32);
+ const int32x4_t v_joint_cost_d = vreinterpretq_s32_u8(
+ vbslq_u8(vreinterpretq_u8_u32(v_sel_d),
+ vreinterpretq_u8_s32(v_joint_cost_0_d),
+ vreinterpretq_u8_s32(v_joint_cost_1_d)));
+ v_cost_d = vaddq_s32(v_cost_d, v_joint_cost_d);
+ }
+
+ // Multiply by sad_per_bit
+ v_cost_d = vmulq_s32(v_cost_d, v_spb_d);
+ // ROUND_POWER_OF_TWO(v_cost_d, VP9_PROB_COST_SHIFT)
+ v_cost_d =
+ vaddq_s32(v_cost_d, vdupq_n_s32(1 << (VP9_PROB_COST_SHIFT - 1)));
+ v_cost_d = vshrq_n_s32(v_cost_d, VP9_PROB_COST_SHIFT);
+ // Add the cost to the sad
+ v_sad_d = vaddq_s32(v_sad_d, v_cost_d);
+
+ // Make the motion vectors outside the search area have max cost
+ // by or'ing in the comparison mask, this way the minimum search won't
+ // pick them.
+ v_sad_d = vorrq_s32(v_sad_d, vreinterpretq_s32_u32(v_outside_d));
+
+ // Find the minimum value and index horizontally in v_sad_d
+ {
+ uint32_t local_best_sad;
+#if defined(__aarch64__)
+ local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
+#else
+ uint32x2_t horiz_min_0 =
+ vmin_u32(vget_low_u32(vreinterpretq_u32_s32(v_sad_d)),
+ vget_high_u32(vreinterpretq_u32_s32(v_sad_d)));
+ uint32x2_t horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+ vst1_lane_u32(&local_best_sad, horiz_min_1, 0);
+#endif
+
+ // Update the global minimum if the local minimum is smaller
+ if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+ uint32_t local_best_idx;
+ const uint32x4_t v_sel_d =
+ vceqq_s32(v_sad_d, vdupq_n_s32(local_best_sad));
+ uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
+ v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
+
+#if defined(__aarch64__)
+ local_best_idx = vminvq_u32(v_mask_d);
+#else
+ horiz_min_0 =
+ vmin_u32(vget_low_u32(v_mask_d), vget_high_u32(v_mask_d));
+ horiz_min_1 = vpmin_u32(horiz_min_0, horiz_min_0);
+ vst1_lane_u32(&local_best_idx, horiz_min_1, 0);
+#endif
+
+ new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
+ new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
+
+ best_sad = local_best_sad;
+ }
+ }
+ }
+
+ bmv = new_bmv;
+ best_address = new_best_address;
+
+ v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
+#if defined(__aarch64__)
+ v_ba_q = vdupq_n_s64((intptr_t)best_address);
+#else
+ v_ba_d = vdupq_n_s32((intptr_t)best_address);
+#endif
+
+ if (UNLIKELY(best_address == in_what)) {
+ (*num00)++;
+ }
+ }
+
+ *best_mv = bmv.as_mv;
+ return best_sad;
+}
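One detail worth a scalar model: when several lanes tie for the minimum SAD, the code keeps each matching lane's index, forces the rest to 0xffffffff, and takes a horizontal minimum, so the lowest lane index wins, mirroring the scalar search's first-match behaviour. Illustrative equivalent:

    #include <stdint.h>

    static int first_min_lane(const uint32_t sad[4], uint32_t best) {
      uint32_t best_idx = 0xffffffffu;
      int i;
      for (i = 0; i < 4; ++i) {  // vceqq + vand(v_idx) + vbsl + vminvq
        const uint32_t cand = (sad[i] == best) ? (uint32_t)i : 0xffffffffu;
        if (cand < best_idx) best_idx = cand;
      }
      return (int)best_idx;
    }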
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
index e46f789ba..bc8dd4a34 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
+++ b/libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c
@@ -14,6 +14,7 @@
#include "./vpx_dsp_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vp9/common/vp9_blockd.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_dsp/vpx_filter.h"
@@ -710,8 +711,8 @@ void vp9_scale_and_extend_frame_neon(const YV12_BUFFER_CONFIG *src,
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
const int dst_h = dst->y_crop_height;
- const int dst_uv_w = dst_w / 2;
- const int dst_uv_h = dst_h / 2;
+ const int dst_uv_w = dst->uv_crop_width;
+ const int dst_uv_h = dst->uv_crop_height;
int scaled = 0;
// phase_scaler is usually 0 or 8.
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index 236c3176c..c2b55fcba 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -26,9 +26,8 @@
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/vpx_dsp_common.h"
-static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
- const int16x8_t dequant,
- tran_low_t *dqcoeff) {
+static VPX_FORCE_INLINE void calculate_dqcoeff_and_store(
+ const int16x8_t qcoeff, const int16x8_t dequant, tran_low_t *dqcoeff) {
const int32x4_t dqcoeff_0 =
vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
const int32x4_t dqcoeff_1 =
@@ -42,6 +41,82 @@ static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
+ int16x8_t v_eobmax,
+ uint16x8_t v_nz_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan_ptr[0]);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, vdupq_n_s16(0), v_iscan);
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#ifdef __aarch64__
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif // __aarch64__
+}
+
+static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ int16x8_t *round, int16x8_t *quant,
+ int16x8_t *dequant) {
+ *round = vld1q_s16(round_ptr);
+ *quant = vld1q_s16(quant_ptr);
+ *dequant = vld1q_s16(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
+ int16x8_t *v_quant,
+ int16x8_t *v_dequant) {
+#ifdef __aarch64__
+ *v_round = vdupq_laneq_s16(*v_round, 1);
+ *v_quant = vdupq_laneq_s16(*v_quant, 1);
+ *v_dequant = vdupq_laneq_s16(*v_dequant, 1);
+#else
+ *v_round = vdupq_lane_s16(vget_low_s16(*v_round), 1);
+ *v_quant = vdupq_lane_s16(vget_low_s16(*v_quant), 1);
+ *v_dequant = vdupq_lane_s16(vget_low_s16(*v_dequant), 1);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_8(
+ const int16x8_t *v_round, const int16x8_t *v_quant,
+ const int16x8_t *v_dequant, const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ int16x8_t *v_eobmax) {
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs, *v_round);
+ const int32x4_t v_tmp_lo =
+ vmull_s16(vget_low_s16(v_tmp), vget_low_s16(*v_quant));
+ const int32x4_t v_tmp_hi =
+ vmull_s16(vget_high_s16(v_tmp), vget_high_s16(*v_quant));
+ const int16x8_t v_tmp2 =
+ vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ calculate_dqcoeff_and_store(v_qcoeff, *v_dequant, dqcoeff_ptr);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+ *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
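quantize_fp_8 vectorizes the fast-path quantizer eight coefficients at a time; the quant factor carries 16 fractional bits, so the quantized magnitude is the top half of a 32-bit product. A scalar model of one lane, illustrative only:

    #include <stdlib.h>  // abs

    static int16_t quantize_fp_scalar(int16_t coeff, int16_t round,
                                      int16_t quant, int16_t dequant,
                                      int32_t *dqcoeff) {
      const int16_t sign = (int16_t)(coeff >> 15);             // 0 or -1
      const int32_t tmp = abs(coeff) + round;                  // vqaddq_s16 (saturating)
      const int16_t qmag = (int16_t)((tmp * quant) >> 16);     // vmull + vshrn #16
      const int16_t qcoeff = (int16_t)((qmag ^ sign) - sign);  // restore sign
      *dqcoeff = (int32_t)qcoeff * dequant;                    // vmull_s16
      return qcoeff;
    }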
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
const int16_t *round_ptr, const int16_t *quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -50,136 +125,54 @@ void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
// Quantization pass: All coefficients with index >= zero_flag are
// skippable. Note: zero_flag can be zero.
int i;
- const int16x8_t v_zero = vdupq_n_s16(0);
- const int16x8_t v_one = vdupq_n_s16(1);
- int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
- int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
- int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
- int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
-
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+ int16x8_t v_round, v_quant, v_dequant;
(void)scan;
- // adjust for dc
- v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
- v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
- v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ load_fp_values(round_ptr, quant_ptr, dequant_ptr, &v_round, &v_quant,
+ &v_dequant);
// process dc and the first seven ac coeffs
- {
- const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
- const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
- const int16x8_t v_abs = vabsq_s16(v_coeff);
- const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
- const int32x4_t v_tmp_lo =
- vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
- const int32x4_t v_tmp_hi =
- vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
- const int16x8_t v_tmp2 =
- vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
- const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
- const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
- const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
- const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
- const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
- calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr);
- v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
- store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
- v_round = vmovq_n_s16(round_ptr[1]);
- v_quant = vmovq_n_s16(quant_ptr[1]);
- v_dequant = vmovq_n_s16(dequant_ptr[1]);
- }
+ quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr, iscan, qcoeff_ptr,
+ dqcoeff_ptr, &v_eobmax);
+
// now process the rest of the ac coeffs
+ update_fp_values(&v_round, &v_quant, &v_dequant);
for (i = 8; i < count; i += 8) {
- const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
- const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
- const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
- const int16x8_t v_abs = vabsq_s16(v_coeff);
- const int16x8_t v_tmp = vqaddq_s16(v_abs, v_round);
- const int32x4_t v_tmp_lo =
- vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant));
- const int32x4_t v_tmp_hi =
- vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant));
- const int16x8_t v_tmp2 =
- vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16));
- const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
- const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
- const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1);
- const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
- const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
- calculate_dqcoeff_and_store(v_qcoeff, v_dequant, dqcoeff_ptr + i);
- v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
- store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff);
+ quantize_fp_8(&v_round, &v_quant, &v_dequant, coeff_ptr + i, iscan + i,
+ qcoeff_ptr + i, dqcoeff_ptr + i, &v_eobmax);
}
-#ifdef __aarch64__
- *eob_ptr = vmaxvq_s16(v_eobmax_76543210);
-#else
- {
- const int16x4_t v_eobmax_3210 = vmax_s16(vget_low_s16(v_eobmax_76543210),
- vget_high_s16(v_eobmax_76543210));
- const int64x1_t v_eobmax_xx32 =
- vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
- const int16x4_t v_eobmax_tmp =
- vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
- const int64x1_t v_eobmax_xxx3 =
- vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
- const int16x4_t v_eobmax_final =
- vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
-
- *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
- }
-#endif // __aarch64__
+
+ *eob_ptr = get_max_eob(v_eobmax);
}
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
}
-void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
- const int16_t *round_ptr,
- const int16_t *quant_ptr,
- tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
- const int16_t *dequant_ptr, uint16_t *eob_ptr,
- const int16_t *scan, const int16_t *iscan) {
- const int16x8_t one = vdupq_n_s16(1);
- const int16x8_t neg_one = vdupq_n_s16(-1);
-
- // ROUND_POWER_OF_TWO(round_ptr[], 1)
- const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
- const int16x8_t quant = vld1q_s16(quant_ptr);
- const int16x4_t dequant = vld1_s16(dequant_ptr);
- // dequant >> 2 is used similar to zbin as a threshold.
- const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
+static VPX_FORCE_INLINE void quantize_fp_32x32_8(
+ const int16x8_t *v_round, const int16x8_t *v_quant,
+ const int16x8_t *v_dequant, const int16x8_t *dequant_thresh,
+ const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, int16x8_t *v_eobmax) {
+ const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_coeff_abs = vabsq_s16(v_coeff);
+ const int16x8_t v_thr_mask =
+ vreinterpretq_s16_u16(vcgeq_s16(v_coeff_abs, *dequant_thresh));
+ const int16x8_t v_tmp_rnd =
+ vandq_s16(vqaddq_s16(v_coeff_abs, *v_round), v_thr_mask);
+ const int16x8_t v_abs_qcoeff = vqdmulhq_s16(v_tmp_rnd, *v_quant);
+ const int16x8_t v_qcoeff =
+ vsubq_s16(veorq_s16(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ const uint16x8_t v_nz_mask = vceqq_s16(v_abs_qcoeff, vdupq_n_s16(0));
- // Process dc and the first seven ac coeffs.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
- const int16x8_t dequant_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
-
- int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
int32x4_t dqcoeff_0, dqcoeff_1;
- uint16x8_t eob_max;
- (void)scan;
- (void)count;
-
- // coeff * quant_ptr[]) >> 15
- qcoeff = vqdmulhq_s16(qcoeff, quant);
-
- // Restore sign.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
- qcoeff = vandq_s16(qcoeff, dequant_mask);
-
- // qcoeff * dequant[] / 2
- dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), dequant);
- dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
-
+ dqcoeff_0 = vmull_s16(vget_low_s16(v_qcoeff), vget_low_s16(*v_dequant));
+ dqcoeff_1 = vmull_s16(vget_high_s16(v_qcoeff), vget_high_s16(*v_dequant));
// Add 1 if negative to round towards zero, because the C implementation uses division.
dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
#if CONFIG_VP9_HIGHBITDEPTH
vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
@@ -188,76 +181,228 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
vshrn_n_s32(dqcoeff_1, 1)));
#endif
- eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+ store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff);
+
+ *v_eobmax = get_max_lane_eob(iscan_ptr, *v_eobmax, v_nz_mask);
+}
+
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int16x8_t eob_max = vdupq_n_s16(-1);
+ // ROUND_POWER_OF_TWO(round_ptr[], 1)
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
+ // dequant >> 2 is used similarly to zbin as a threshold.
+ int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2);
+ int i;
+
+ (void)scan;
+ (void)count;
+
+ // Process dc and the first seven ac coeffs.
+ quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+ iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ update_fp_values(&round, &quant, &dequant);
+ dequant_thresh = vdupq_lane_s16(vget_low_s16(dequant_thresh), 1);
iscan += 8;
coeff_ptr += 8;
qcoeff_ptr += 8;
dqcoeff_ptr += 8;
- {
- int i;
- const int16x8_t round = vrshrq_n_s16(vmovq_n_s16(round_ptr[1]), 1);
- const int16x8_t quant = vmovq_n_s16(quant_ptr[1]);
- const int16x8_t dequant_thresh =
- vshrq_n_s16(vmovq_n_s16(dequant_ptr[1]), 2);
-
- // Process the rest of the ac coeffs.
- for (i = 8; i < 32 * 32; i += 8) {
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
- const int16x8_t dequant_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, dequant_thresh));
-
- int16x8_t qcoeff = vqaddq_s16(coeff_abs, round);
- int32x4_t dqcoeff_0, dqcoeff_1;
-
- qcoeff = vqdmulhq_s16(qcoeff, quant);
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
- qcoeff = vandq_s16(qcoeff, dequant_mask);
-
- dqcoeff_0 = vmull_n_s16(vget_low_s16(qcoeff), dequant_ptr[1]);
- dqcoeff_1 = vmull_n_s16(vget_high_s16(qcoeff), dequant_ptr[1]);
-
- dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
- dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+ // Process the rest of the ac coeffs.
+ for (i = 8; i < 32 * 32; i += 8) {
+ quantize_fp_32x32_8(&round, &quant, &dequant, &dequant_thresh, coeff_ptr,
+ iscan, qcoeff_ptr, dqcoeff_ptr, &eob_max);
+
+ iscan += 8;
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
#if CONFIG_VP9_HIGHBITDEPTH
- vst1q_s32(dqcoeff_ptr, vshrq_n_s32(dqcoeff_0, 1));
- vst1q_s32(dqcoeff_ptr + 4, vshrq_n_s32(dqcoeff_1, 1));
-#else
- store_s16q_to_tran_low(
- dqcoeff_ptr,
- vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
-#endif
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+ // const int abs_qcoeff = (int)((tmp * quant) >> 16);
+ const int32x4_t v_abs_qcoeff = vqdmulhq_s32(v_tmp, v_quant_s32);
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ const int32x4_t v_abs_dqcoeff = vmulq_s32(v_abs_qcoeff, v_dequant_s32);
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Packed nz_qcoeff_mask. Used to find eob.
+ return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
+}
- eob_max =
- vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+void vp9_highbd_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_round = vld1_s16(round_ptr);
+ int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+ int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+ (void)scan;
- iscan += 8;
- coeff_ptr += 8;
- qcoeff_ptr += 8;
- dqcoeff_ptr += 8;
- }
+ // DC and first 3 AC
+ v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // overwrite the DC constants with AC constants
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+ // 4 more AC
+ v_mask_hi =
+ highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // Find the max lane eob for the first 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ n_coeffs -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = highbd_quantize_fp_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ v_mask_hi =
+ highbd_quantize_fp_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ n_coeffs -= 8;
+ } while (n_coeffs);
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
-#ifdef __aarch64__
- *eob_ptr = vmaxvq_u16(eob_max);
-#else
- {
- const uint16x4_t eob_max_0 =
- vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
- const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
- const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
- vst1_lane_u16(eob_ptr, eob_max_2, 0);
- }
-#endif // __aarch64__
- }
+static VPX_FORCE_INLINE uint16x4_t
+highbd_quantize_fp_32x32_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32, int32x4_t v_round_s32) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ // ((abs_coeff << (1 + log_scale)) >= dequant_ptr[rc01])
+ const int32x4_t v_abs_coeff_scaled = vshlq_n_s32(v_abs_coeff, 2);
+ const uint32x4_t v_mask = vcgeq_s32(v_abs_coeff_scaled, v_dequant_s32);
+ // const int64_t tmp = vmask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const int32x4_t v_tmp = vandq_s32(vaddq_s32(v_abs_coeff, v_round_s32),
+ vreinterpretq_s32_u32(v_mask));
+ // const int abs_qcoeff = (int)((tmp * quant) >> (16 - log_scale));
+ const int32x4_t v_abs_qcoeff =
+ vqdmulhq_s32(vshlq_n_s32(v_tmp, 1), v_quant_s32);
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ // const int abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale (log_scale == 1).
+ const int32x4_t v_abs_dqcoeff =
+ vshrq_n_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), 1);
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Packed nz_qcoeff_mask. Used to find eob.
+ return vmovn_u32(vceqq_s32(v_abs_qcoeff, vdupq_n_s32(0)));
}
+
+void vp9_highbd_quantize_fp_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
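+ // ROUND_POWER_OF_TWO(round_ptr[], 1): vqrdmulh by 1 << 14 (0.5 in Q15)
+ // halves each round value with rounding.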
+ const int16x4_t v_round =
+ vqrdmulh_n_s16(vld1_s16(round_ptr), (int16_t)(1 << 14));
+ int32x4_t v_round_s32 = vaddl_s16(v_round, v_zero);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vaddl_s16(v_quant, v_zero), 15);
+ int32x4_t v_dequant_s32 = vaddl_s16(v_dequant, v_zero);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+ (void)scan;
+
+ // DC and first 3 AC
+ v_mask_lo =
+ highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // overwrite the DC constants with AC constants
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+
+ // 4 more AC
+ v_mask_hi =
+ highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+
+ // Find the max lane eob for the first 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ n_coeffs -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo =
+ highbd_quantize_fp_32x32_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ v_quant_s32, v_dequant_s32, v_round_s32);
+ v_mask_hi = highbd_quantize_fp_32x32_4(coeff_ptr + 4, qcoeff_ptr + 4,
+ dqcoeff_ptr + 4, v_quant_s32,
+ v_dequant_s32, v_round_s32);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ n_coeffs -= 8;
+ } while (n_coeffs);
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
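Two fixed-point identities do the heavy lifting in the rewritten kernels above. vqdmulh returns the high half of a doubled product, so feeding it quant pre-shifted left by 15 (as v_quant_s32 is built) yields exactly the (tmp * quant) >> 16 of the C reference; and adding a value's sign bit before an arithmetic right shift reproduces C division's truncation toward zero, which is what the "Add 1 if negative" step does. A scalar sketch of both tricks (standalone C with illustrative helper names, not part of the patch):

#include <assert.h>
#include <stdint.h>

/* vqdmulh(a, b) == (2 * a * b) >> 32 (high half of the doubled product).
   With b = quant << 15 this is (a * quant) >> 16, as in the C reference. */
static int32_t model_vqdmulh_s32(int32_t a, int32_t b) {
  return (int32_t)(((int64_t)a * b * 2) >> 32);
}

/* qcoeff * dequant / 2 with C's truncation toward zero: a plain arithmetic
   shift rounds toward -infinity, so add the sign bit (1 for negatives). */
static int32_t div2_toward_zero(int32_t product) {
  return (product + (int32_t)((uint32_t)product >> 31)) >> 1;
}

int main(void) {
  const int32_t tmp = 100000, quant = 7;
  assert(model_vqdmulh_s32(tmp, quant << 15) ==
         (int32_t)(((int64_t)tmp * quant) >> 16));
  assert(div2_toward_zero(-7) == -3); /* -7 >> 1 alone would give -4 */
  assert(div2_toward_zero(7) == 3);
  return 0;
}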
diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index e336179e9..28ab10a13 100644
--- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -471,7 +471,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) {
cr->sb_index = i;
cr->reduce_refresh = 0;
if (cpi->oxcf.content != VP9E_CONTENT_SCREEN)
- if (count_sel<(3 * count_tot)>> 2) cr->reduce_refresh = 1;
+ if (count_sel < (3 * count_tot) >> 2) cr->reduce_refresh = 1;
}
// Set cyclic refresh parameters.
@@ -558,7 +558,7 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
cr->percent_refresh = 10;
cr->rate_ratio_qdelta = 1.5;
cr->rate_boost_fac = 10;
- if (cpi->refresh_golden_frame == 1) {
+ if (cpi->refresh_golden_frame == 1 && !cpi->use_svc) {
cr->percent_refresh = 0;
cr->rate_ratio_qdelta = 1.0;
}
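The first hunk above is whitespace-only: `>>` binds tighter than `<` in C, so `count_sel<(3 * count_tot)>> 2` and the reformatted spelling both parse as `count_sel < ((3 * count_tot) >> 2)`; the old layout merely read like a template instantiation. A quick check of that precedence assumption (illustrative):

#include <assert.h>

int main(void) {
  const int count_sel = 5, count_tot = 8;
  /* Shift before relational: both spellings compare against (3*8)>>2 == 6. */
  assert((count_sel < (3 * count_tot) >> 2) ==
         (count_sel < ((3 * count_tot) >> 2)));
  return 0;
}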
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 75bd097f2..a84c8b524 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -134,9 +134,9 @@ static void pack_mb_tokens(vpx_writer *w, TOKENEXTRA **tp,
const TOKENEXTRA *p;
const vp9_extra_bit *const extra_bits =
#if CONFIG_VP9_HIGHBITDEPTH
- (bit_depth == VPX_BITS_12)
- ? vp9_extra_bits_high12
- : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10 : vp9_extra_bits;
+ (bit_depth == VPX_BITS_12) ? vp9_extra_bits_high12
+ : (bit_depth == VPX_BITS_10) ? vp9_extra_bits_high10
+ : vp9_extra_bits;
#else
vp9_extra_bits;
(void)bit_depth;
diff --git a/libvpx/vp9/encoder/vp9_denoiser.c b/libvpx/vp9/encoder/vp9_denoiser.c
index 2885223b5..77d72396a 100644
--- a/libvpx/vp9/encoder/vp9_denoiser.c
+++ b/libvpx/vp9/encoder/vp9_denoiser.c
@@ -233,7 +233,7 @@ static VP9_DENOISER_DECISION perform_motion_compensation(
frame == ALTREF_FRAME ||
(frame == GOLDEN_FRAME && use_gf_temporal_ref) ||
(frame != LAST_FRAME &&
- ((ctx->zeromv_lastref_sse<(5 * ctx->zeromv_sse)>> 2) ||
+ ((ctx->zeromv_lastref_sse < (5 * ctx->zeromv_sse) >> 2) ||
denoiser->denoising_level >= kDenHigh))) {
frame = LAST_FRAME;
ctx->newmv_sse = ctx->zeromv_lastref_sse;
@@ -764,8 +764,9 @@ int64_t vp9_scale_acskip_thresh(int64_t threshold,
VP9_DENOISER_LEVEL noise_level, int abs_sumdiff,
int temporal_layer_id) {
if (noise_level >= kDenLow && abs_sumdiff < 5)
- return threshold *=
- (noise_level == kDenLow) ? 2 : (temporal_layer_id == 2) ? 10 : 6;
+ return threshold *= (noise_level == kDenLow) ? 2
+ : (temporal_layer_id == 2) ? 10
+ : 6;
else
return threshold;
}
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index a9f392bf5..1483ac069 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -1299,7 +1299,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
// the reference (base layer frame) is key frame (i.e., is_key_frame == 1).
int is_key_frame =
(frame_is_intra_only(cm) ||
- (is_one_pass_cbr_svc(cpi) &&
+ (is_one_pass_svc(cpi) &&
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame));
// Always use 4x4 partition for key frame.
const int use_4x4_partition = frame_is_intra_only(cm);
@@ -1406,7 +1406,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
assert(yv12 != NULL);
- if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) ||
+ if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
cpi->svc.use_gf_temporal_ref_current_layer) {
// For now, GOLDEN will not be used for non-zero spatial layers, since
// it may not be a temporal reference.
@@ -3413,7 +3413,8 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
const VP9_COMMON *const cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MODE_INFO *const mi = xd->mi[0];
- const YV12_BUFFER_CONFIG *const yv12 = get_ref_frame_buffer(cpi, ref);
+ YV12_BUFFER_CONFIG *yv12;
+ YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, ref);
const int step_param = 1;
const MvLimits tmp_mv_limits = x->mv_limits;
const SEARCH_METHODS search_method = NSTEP;
@@ -3422,6 +3423,11 @@ static void simple_motion_search(const VP9_COMP *const cpi, MACROBLOCK *const x,
MV best_mv = { 0, 0 };
int cost_list[5];
+ if (scaled_ref_frame)
+ yv12 = scaled_ref_frame;
+ else
+ yv12 = get_ref_frame_buffer(cpi, ref);
+
assert(yv12 != NULL);
if (!yv12) return;
vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col,
@@ -5381,7 +5387,7 @@ static void get_estimated_pred(VP9_COMP *cpi, const TileInfo *const tile,
assert(yv12 != NULL);
- if (!(is_one_pass_cbr_svc(cpi) && cpi->svc.spatial_layer_id) ||
+ if (!(is_one_pass_svc(cpi) && cpi->svc.spatial_layer_id) ||
cpi->svc.use_gf_temporal_ref_current_layer) {
// For now, GOLDEN will not be used for non-zero spatial layers, since
// it may not be a temporal reference.
diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c
index d3f4d1ea8..b66fdc0bc 100644
--- a/libvpx/vp9/encoder/vp9_encoder.c
+++ b/libvpx/vp9/encoder/vp9_encoder.c
@@ -1333,7 +1333,7 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
// For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
// buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
// target of 1/4x1/4. number_spatial_layers must be greater than 2.
- if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
+ if (is_one_pass_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
cpi->svc.number_spatial_layers > 2) {
cpi->svc.scaled_temp_is_alloc = 1;
if (vpx_realloc_frame_buffer(
@@ -1511,7 +1511,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
// Temporal scalability.
cpi->svc.number_temporal_layers = oxcf->ts_number_layers;
- if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+ if ((cpi->svc.number_temporal_layers > 1) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
cpi->oxcf.pass != 1)) {
@@ -1527,6 +1527,7 @@ static void init_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
init_buffer_indices(cpi);
vp9_noise_estimate_init(&cpi->noise_estimate, cm->width, cm->height);
+ cpi->fixed_qp_onepass = 0;
}
void vp9_check_reset_rc_flag(VP9_COMP *cpi) {
@@ -2077,7 +2078,7 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
rc->rc_2_frame = 0;
}
- if ((cpi->svc.number_temporal_layers > 1 && cpi->oxcf.rc_mode == VPX_CBR) ||
+ if ((cpi->svc.number_temporal_layers > 1) ||
((cpi->svc.number_temporal_layers > 1 ||
cpi->svc.number_spatial_layers > 1) &&
cpi->oxcf.pass != 1)) {
@@ -3263,7 +3264,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
vp9_denoiser_update_ref_frame(cpi);
#endif
- if (is_one_pass_cbr_svc(cpi)) vp9_svc_update_ref_frame(cpi);
+ if (is_one_pass_svc(cpi)) vp9_svc_update_ref_frame(cpi);
}
static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) {
@@ -3857,11 +3858,11 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
int q = 0, bottom_index = 0, top_index = 0;
int no_drop_scene_change = 0;
const INTERP_FILTER filter_scaler =
- (is_one_pass_cbr_svc(cpi))
+ (is_one_pass_svc(cpi))
? svc->downsample_filter_type[svc->spatial_layer_id]
: EIGHTTAP;
const int phase_scaler =
- (is_one_pass_cbr_svc(cpi))
+ (is_one_pass_svc(cpi))
? svc->downsample_filter_phase[svc->spatial_layer_id]
: 0;
@@ -3882,7 +3883,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
set_frame_size(cpi);
- if (is_one_pass_cbr_svc(cpi) &&
+ if (is_one_pass_svc(cpi) &&
cpi->un_scaled_source->y_width == cm->width << 2 &&
cpi->un_scaled_source->y_height == cm->height << 2 &&
svc->scaled_temp.y_width == cm->width << 1 &&
@@ -3896,7 +3897,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
cm, cpi->un_scaled_source, &cpi->scaled_source, &svc->scaled_temp,
filter_scaler, phase_scaler, filter_scaler2, phase_scaler2);
svc->scaled_one_half = 1;
- } else if (is_one_pass_cbr_svc(cpi) &&
+ } else if (is_one_pass_svc(cpi) &&
cpi->un_scaled_source->y_width == cm->width << 1 &&
cpi->un_scaled_source->y_height == cm->height << 1 &&
svc->scaled_one_half) {
@@ -3911,7 +3912,7 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
}
#ifdef OUTPUT_YUV_SVC_SRC
// Write out at most 3 spatial layers.
- if (is_one_pass_cbr_svc(cpi) && svc->spatial_layer_id < 3) {
+ if (is_one_pass_svc(cpi) && svc->spatial_layer_id < 3) {
vpx_write_yuv_frame(yuv_svc_src[svc->spatial_layer_id], cpi->Source);
}
#endif
@@ -4020,14 +4021,14 @@ static int encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
if (vp9_rc_drop_frame(cpi)) return 0;
}
- // For 1 pass CBR SVC, only ZEROMV is allowed for spatial reference frame
+ // For 1 pass SVC, only ZEROMV is allowed for spatial reference frame
// when svc->force_zero_mode_spatial_ref = 1. Under those conditions we can
// avoid this frame-level upsampling (for non intra_only frames).
// For SVC single_layer mode, dynamic resize is allowed and we need to
// scale references for this case.
if (frame_is_intra_only(cm) == 0 &&
((svc->single_layer_svc && cpi->oxcf.resize_mode == RESIZE_DYNAMIC) ||
- !(is_one_pass_cbr_svc(cpi) && svc->force_zero_mode_spatial_ref))) {
+ !(is_one_pass_svc(cpi) && svc->force_zero_mode_spatial_ref))) {
vp9_scale_references(cpi);
}
@@ -4367,7 +4368,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
int frame_over_shoot_limit;
int frame_under_shoot_limit;
int q = 0, q_low = 0, q_high = 0;
- int last_q_attempt = 0;
int enable_acl;
#ifdef AGGRESSIVE_VBR
int qrange_adj = 1;
@@ -4381,8 +4381,18 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// Maximal frame size allowed by the external rate control.
// case: 0, we ignore the max frame size limit, and encode with the qindex
// passed in by the external rate control model.
- // case: -1, we take VP9's decision for the max frame size.
+ // If the external qindex is VPX_DEFAULT_Q, libvpx will pick a qindex
+ // and may recode if undershoot/overshoot is seen.
+ // If the external qindex is not VPX_DEFAULT_Q, we force no recode.
+ // case: -1, we take libvpx's decision for the max frame size, as well as
+ // the recode decision.
+ // Otherwise: if a specific size is given, libvpx's recode decision
+ // will respect the given size.
int ext_rc_max_frame_size = 0;
+ // Use VP9's qindex decision. This flag is used only by the external rate
+ // control model, to help determine whether to recode when
+ // |ext_rc_max_frame_size| is 0.
+ int ext_rc_use_default_q = 1;
const int orig_rc_max_frame_bandwidth = rc->max_frame_bandwidth;
#if CONFIG_RATE_CTRL
@@ -4491,7 +4501,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
}
}
#endif // CONFIG_RATE_CTRL
- if (cpi->ext_ratectrl.ready && !ext_rc_recode) {
+ if (cpi->ext_ratectrl.ready && !ext_rc_recode &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
vpx_codec_err_t codec_status;
const GF_GROUP *gf_group = &cpi->twopass.gf_group;
vpx_rc_encodeframe_decision_t encode_frame_decision;
@@ -4500,16 +4511,27 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
const RefCntBuffer *curr_frame_buf =
get_ref_cnt_buffer(cm, cm->new_fb_idx);
+ // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+ // index 1 refers to the first encoding frame in a gf group.
+ // Therefore, if it is ARF_UPDATE, this gf group uses an alt ref.
+ // See function define_gf_group_structure().
+ const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
get_ref_frame_bufs(cpi, ref_frame_bufs);
codec_status = vp9_extrc_get_encodeframe_decision(
&cpi->ext_ratectrl, curr_frame_buf->frame_index,
cm->current_frame_coding_index, gf_group->index, update_type,
- ref_frame_bufs, ref_frame_flags, &encode_frame_decision);
+ gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+ &encode_frame_decision);
if (codec_status != VPX_CODEC_OK) {
vpx_internal_error(&cm->error, codec_status,
"vp9_extrc_get_encodeframe_decision() failed");
}
- q = encode_frame_decision.q_index;
+ // If the external model recommends a reserved value, we use
+ // libvpx's default q.
+ if (encode_frame_decision.q_index != VPX_DEFAULT_Q) {
+ q = encode_frame_decision.q_index;
+ ext_rc_use_default_q = 0;
+ }
ext_rc_max_frame_size = encode_frame_decision.max_frame_size;
}
@@ -4551,8 +4573,8 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
}
- if (cpi->ext_ratectrl.ready) {
- last_q_attempt = q;
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_QP) != 0) {
// In general, for the external rate control, we take the qindex provided
// as input and encode the frame with this qindex faithfully. However,
// in some extreme scenarios, the provided qindex leads to a massive
@@ -4560,20 +4582,13 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
// to pick a new qindex and recode the frame. We return the new qindex
// through the API to the external model.
if (ext_rc_max_frame_size == 0) {
- break;
+ if (!ext_rc_use_default_q) break;
} else if (ext_rc_max_frame_size == -1) {
- if (rc->projected_frame_size < rc->max_frame_bandwidth) {
- break;
- }
+ // Do nothing; fall back to libvpx's recode decision.
} else {
- if (rc->projected_frame_size < ext_rc_max_frame_size) {
- break;
- }
+ // Change the max frame size, used in libvpx's recode decision.
+ rc->max_frame_bandwidth = ext_rc_max_frame_size;
}
- rc->max_frame_bandwidth = ext_rc_max_frame_size;
- // If the current frame size exceeds the ext_rc_max_frame_size,
- // we adjust the worst qindex to meet the frame size constraint.
- q_high = 255;
ext_rc_recode = 1;
}
#if CONFIG_RATE_CTRL
@@ -4776,23 +4791,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest
rc->projected_frame_size < rc->max_frame_bandwidth)
loop = 0;
- // Special handling of external max frame size constraint
- if (ext_rc_recode) {
- // If the largest q is not able to meet the max frame size limit,
- // do nothing.
- if (rc->projected_frame_size > ext_rc_max_frame_size &&
- last_q_attempt == 255) {
- break;
- }
- // If VP9's q selection leads to a smaller q, we force it to use
- // a larger q to better approximate the external max frame size
- // constraint.
- if (rc->projected_frame_size > ext_rc_max_frame_size &&
- q <= last_q_attempt) {
- q = VPXMIN(255, last_q_attempt + 1);
- }
- }
-
if (loop) {
++loop_count;
++loop_at_this_size;
@@ -5518,6 +5516,32 @@ static void encode_frame_to_data_rate(
save_encode_params(cpi);
}
#endif
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0) {
+ vpx_codec_err_t codec_status;
+ const GF_GROUP *gf_group = &cpi->twopass.gf_group;
+ FRAME_UPDATE_TYPE update_type = gf_group->update_type[gf_group->index];
+ const int ref_frame_flags = get_ref_frame_flags(cpi);
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES];
+ const RefCntBuffer *curr_frame_buf = get_ref_cnt_buffer(cm, cm->new_fb_idx);
+ // index 0 of a gf group is always KEY/OVERLAY/GOLDEN.
+ // index 1 refers to the first encoding frame in a gf group.
+ // Therefore, if it is ARF_UPDATE, this gf group uses an alt ref.
+ // See function define_gf_group_structure().
+ const int use_alt_ref = gf_group->update_type[1] == ARF_UPDATE;
+ int ext_rdmult = VPX_DEFAULT_RDMULT;
+ get_ref_frame_bufs(cpi, ref_frame_bufs);
+ codec_status = vp9_extrc_get_frame_rdmult(
+ &cpi->ext_ratectrl, curr_frame_buf->frame_index,
+ cm->current_frame_coding_index, gf_group->index, update_type,
+ gf_group->gf_group_size, use_alt_ref, ref_frame_bufs, ref_frame_flags,
+ &ext_rdmult);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_frame_rdmult() failed");
+ }
+ cpi->ext_ratectrl.ext_rdmult = ext_rdmult;
+ }
if (cpi->sf.recode_loop == DISALLOW_RECODE) {
if (!encode_without_recode_loop(cpi, size, dest)) return;
@@ -5593,7 +5617,7 @@ static void encode_frame_to_data_rate(
// build the bitstream
vp9_pack_bitstream(cpi, dest, size);
- {
+ if (cpi->ext_ratectrl.ready) {
const RefCntBuffer *coded_frame_buf =
get_ref_cnt_buffer(cm, cm->new_fb_idx);
vpx_codec_err_t codec_status = vp9_extrc_update_encodeframe_result(
@@ -5800,16 +5824,6 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, uint8_t *dest,
unsigned int *frame_flags,
ENCODE_FRAME_RESULT *encode_frame_result) {
cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
-
- if (cpi->common.current_frame_coding_index == 0) {
- VP9_COMMON *cm = &cpi->common;
- const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
- &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
- if (codec_status != VPX_CODEC_OK) {
- vpx_internal_error(&cm->error, codec_status,
- "vp9_extrc_send_firstpass_stats() failed");
- }
- }
#if CONFIG_MISMATCH_DEBUG
mismatch_move_frame_idx_w();
#endif
@@ -7626,8 +7640,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
const int gf_group_index = cpi->twopass.gf_group.index;
int i;
- if (is_one_pass_cbr_svc(cpi)) {
- vp9_one_pass_cbr_svc_start_layer(cpi);
+ if (is_one_pass_svc(cpi)) {
+ vp9_one_pass_svc_start_layer(cpi);
}
vpx_usec_timer_start(&cmptimer);
@@ -7647,7 +7661,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
// Normal defaults
cm->reset_frame_context = 0;
cm->refresh_frame_context = 1;
- if (!is_one_pass_cbr_svc(cpi)) {
+ if (!is_one_pass_svc(cpi)) {
cpi->refresh_last_frame = 1;
cpi->refresh_golden_frame = 0;
cpi->refresh_alt_ref_frame = 0;
@@ -7780,7 +7794,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
adjust_frame_rate(cpi, source);
}
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
vp9_update_temporal_layer_framerate(cpi);
vp9_restore_layer_context(cpi);
}
@@ -7914,12 +7928,15 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
}
// Save layer specific state.
- if (is_one_pass_cbr_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
- cpi->svc.number_spatial_layers > 1) &&
- oxcf->pass == 2)) {
+ if (is_one_pass_svc(cpi) || ((cpi->svc.number_temporal_layers > 1 ||
+ cpi->svc.number_spatial_layers > 1) &&
+ oxcf->pass == 2)) {
vp9_save_layer_context(cpi);
}
+ if (cpi->svc.spatial_layer_id == cpi->svc.number_spatial_layers - 1)
+ cpi->fixed_qp_onepass = 0;
+
vpx_usec_timer_mark(&cmptimer);
cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
@@ -7928,7 +7945,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#if CONFIG_INTERNAL_STATS
- if (oxcf->pass != 1) {
+ if (oxcf->pass != 1 && !cpi->last_frame_dropped) {
double samples = 0.0;
cpi->bytes += (int)(*size);
@@ -8090,7 +8107,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
#endif
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
if (cm->show_frame) {
++cpi->svc.spatial_layer_to_encode;
if (cpi->svc.spatial_layer_to_encode >= cpi->svc.number_spatial_layers)
@@ -8159,9 +8176,11 @@ int vp9_set_size_literal(VP9_COMP *cpi, unsigned int width,
unsigned int height) {
VP9_COMMON *cm = &cpi->common;
#if CONFIG_VP9_HIGHBITDEPTH
- update_initial_width(cpi, cm->use_highbitdepth, 1, 1);
+ update_initial_width(cpi, cm->use_highbitdepth, cpi->common.subsampling_x,
+ cpi->common.subsampling_y);
#else
- update_initial_width(cpi, 0, 1, 1);
+ update_initial_width(cpi, 0, cpi->common.subsampling_x,
+ cpi->common.subsampling_y);
#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_VP9_TEMPORAL_DENOISING
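The recode-loop changes above define a small protocol around the external model's max_frame_size: 0 means no size limit (the frame is kept as encoded unless the model also deferred the qindex via VPX_DEFAULT_Q), -1 defers the recode decision to libvpx entirely, and any positive value becomes a hard cap enforced through rc->max_frame_bandwidth. A condensed sketch of that policy (stand-in names, not the encoder's actual control flow):

/* ext_max_frame_size semantics from the hunk above:
 *   0  -> no limit; keep recoding only if libvpx picked the q itself
 *  -1  -> leave the recode decision to libvpx
 *  >0  -> cap rc->max_frame_bandwidth so libvpx's recode logic honors it */
static void apply_ext_rc_size_policy(int ext_max_frame_size,
                                     int ext_used_default_q,
                                     int *max_frame_bandwidth,
                                     int *stop_recoding) {
  if (ext_max_frame_size == 0) {
    if (!ext_used_default_q) *stop_recoding = 1;
  } else if (ext_max_frame_size == -1) {
    /* Fall through: libvpx's undershoot/overshoot checks decide. */
  } else {
    *max_frame_bandwidth = ext_max_frame_size;
  }
}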
diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h
index 1d5894525..cca8b53f8 100644
--- a/libvpx/vp9/encoder/vp9_encoder.h
+++ b/libvpx/vp9/encoder/vp9_encoder.h
@@ -971,6 +971,8 @@ typedef struct VP9_COMP {
RATE_QSTEP_MODEL rq_model[ENCODE_FRAME_TYPES];
#endif
EXT_RATECTRL ext_ratectrl;
+
+ int fixed_qp_onepass;
} VP9_COMP;
#if CONFIG_RATE_CTRL
@@ -1305,7 +1307,7 @@ YV12_BUFFER_CONFIG *vp9_scale_if_required(
void vp9_apply_encoding_flags(VP9_COMP *cpi, vpx_enc_frame_flags_t flags);
-static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
+static INLINE int is_one_pass_svc(const struct VP9_COMP *const cpi) {
return (cpi->use_svc && cpi->oxcf.pass == 0);
}
diff --git a/libvpx/vp9/encoder/vp9_ext_ratectrl.c b/libvpx/vp9/encoder/vp9_ext_ratectrl.c
index 9f0098ab5..1d440442b 100644
--- a/libvpx/vp9/encoder/vp9_ext_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ext_ratectrl.c
@@ -137,19 +137,21 @@ static int extrc_get_frame_type(FRAME_UPDATE_TYPE update_type) {
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
vpx_rc_encodeframe_decision_t *encode_frame_decision) {
if (ext_ratectrl == NULL) {
return VPX_CODEC_INVALID_PARAM;
}
- if (ext_ratectrl->ready) {
+ if (ext_ratectrl->ready && (ext_ratectrl->funcs.rc_type & VPX_RC_QP) != 0) {
vpx_rc_status_t rc_status;
vpx_rc_encodeframe_info_t encode_frame_info;
encode_frame_info.show_index = show_index;
encode_frame_info.coding_index = coding_index;
encode_frame_info.gop_index = gop_index;
encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+ encode_frame_info.gop_size = gop_size;
+ encode_frame_info.use_alt_ref = use_alt_ref;
vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
encode_frame_info.ref_frame_coding_indexes,
@@ -198,3 +200,62 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
}
return VPX_CODEC_OK;
}
+
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+ EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
+ vpx_rc_gop_decision_t *gop_decision) {
+ vpx_rc_status_t rc_status;
+ if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+ (ext_ratectrl->funcs.rc_type & VPX_RC_GOP) == 0) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ rc_status = ext_ratectrl->funcs.get_gop_decision(ext_ratectrl->model,
+ gop_info, gop_decision);
+ if (gop_decision->use_alt_ref) {
+ const int arf_constraint =
+ gop_decision->gop_coding_frames >= gop_info->min_gf_interval &&
+ gop_decision->gop_coding_frames < gop_info->lag_in_frames;
+ if (!arf_constraint || !gop_info->allow_alt_ref) return VPX_CODEC_ERROR;
+ }
+ // TODO(chengchen): Take min and max gf interval from the model
+ // and overwrite libvpx's decision so that we can get rid
+ // of one of the checks here.
+ if (gop_decision->gop_coding_frames > gop_info->frames_to_key ||
+ gop_decision->gop_coding_frames - gop_decision->use_alt_ref >
+ gop_info->max_gf_interval) {
+ return VPX_CODEC_ERROR;
+ }
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ return VPX_CODEC_OK;
+}
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+ EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ int *rdmult) {
+ vpx_rc_status_t rc_status;
+ vpx_rc_encodeframe_info_t encode_frame_info;
+ if (ext_ratectrl == NULL || !ext_ratectrl->ready ||
+ (ext_ratectrl->funcs.rc_type & VPX_RC_RDMULT) == 0) {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+ encode_frame_info.show_index = show_index;
+ encode_frame_info.coding_index = coding_index;
+ encode_frame_info.gop_index = gop_index;
+ encode_frame_info.frame_type = extrc_get_frame_type(update_type);
+ encode_frame_info.gop_size = gop_size;
+ encode_frame_info.use_alt_ref = use_alt_ref;
+
+ vp9_get_ref_frame_info(update_type, ref_frame_flags, ref_frame_bufs,
+ encode_frame_info.ref_frame_coding_indexes,
+ encode_frame_info.ref_frame_valid_list);
+ rc_status = ext_ratectrl->funcs.get_frame_rdmult(ext_ratectrl->model,
+ &encode_frame_info, rdmult);
+ if (rc_status == VPX_RC_ERROR) {
+ return VPX_CODEC_ERROR;
+ }
+ return VPX_CODEC_OK;
+}
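The new rc_type bitmask gates every hook: the qindex path runs only under VPX_RC_QP, the rdmult path only under VPX_RC_RDMULT, with VPX_DEFAULT_RDMULT as the "keep libvpx's value" sentinel (see the vp9_rd.c hunk further down). A sketch of a model that registers the rdmult hook but always defers; the callback signature follows this diff, while the header path and the VPX_RC_OK return value are assumptions:

#include "vpx/vpx_ext_ratectrl.h" /* assumed include path */

static vpx_rc_status_t my_get_frame_rdmult(
    vpx_rc_model_t model, const vpx_rc_encodeframe_info_t *info, int *rdmult) {
  (void)model;
  (void)info;
  /* VPX_DEFAULT_RDMULT makes vp9_compute_rd_mult() keep libvpx's own
     modulated multiplier instead of the external one. */
  *rdmult = VPX_DEFAULT_RDMULT;
  return VPX_RC_OK;
}

/* Wiring (inside the vpx_rc_funcs_t the application hands to libvpx):
 *   funcs.rc_type = VPX_RC_QP | VPX_RC_RDMULT;
 *   funcs.get_frame_rdmult = my_get_frame_rdmult;
 * Without VPX_RC_RDMULT in rc_type, vp9_extrc_get_frame_rdmult() returns
 * VPX_CODEC_INVALID_PARAM and the encoder never consults the model. */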
diff --git a/libvpx/vp9/encoder/vp9_ext_ratectrl.h b/libvpx/vp9/encoder/vp9_ext_ratectrl.h
index 74fd68b96..7c3875883 100644
--- a/libvpx/vp9/encoder/vp9_ext_ratectrl.h
+++ b/libvpx/vp9/encoder/vp9_ext_ratectrl.h
@@ -16,6 +16,7 @@
typedef struct EXT_RATECTRL {
int ready;
+ int ext_rdmult;
vpx_rc_model_t model;
vpx_rc_funcs_t funcs;
vpx_rc_config_t ratectrl_config;
@@ -35,7 +36,7 @@ vpx_codec_err_t vp9_extrc_send_firstpass_stats(
vpx_codec_err_t vp9_extrc_get_encodeframe_decision(
EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
- FRAME_UPDATE_TYPE update_type,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
vpx_rc_encodeframe_decision_t *encode_frame_decision);
@@ -45,4 +46,14 @@ vpx_codec_err_t vp9_extrc_update_encodeframe_result(
const YV12_BUFFER_CONFIG *coded_frame, uint32_t bit_depth,
uint32_t input_bit_depth, const int actual_encoding_qindex);
+vpx_codec_err_t vp9_extrc_get_gop_decision(
+ EXT_RATECTRL *ext_ratectrl, const vpx_rc_gop_info_t *const gop_info,
+ vpx_rc_gop_decision_t *gop_decision);
+
+vpx_codec_err_t vp9_extrc_get_frame_rdmult(
+ EXT_RATECTRL *ext_ratectrl, int show_index, int coding_index, int gop_index,
+ FRAME_UPDATE_TYPE update_type, int gop_size, int use_alt_ref,
+ RefCntBuffer *ref_frame_bufs[MAX_INTER_REF_FRAMES], int ref_frame_flags,
+ int *rdmult);
+
#endif // VPX_VP9_ENCODER_VP9_EXT_RATECTRL_H_
diff --git a/libvpx/vp9/encoder/vp9_firstpass.c b/libvpx/vp9/encoder/vp9_firstpass.c
index 67302ed03..e9250e25c 100644
--- a/libvpx/vp9/encoder/vp9_firstpass.c
+++ b/libvpx/vp9/encoder/vp9_firstpass.c
@@ -2113,11 +2113,10 @@ static int64_t calculate_total_gf_group_bits(VP9_COMP *cpi,
}
// Clamp odd edge cases.
- total_group_bits = (total_group_bits < 0)
- ? 0
- : (total_group_bits > twopass->kf_group_bits)
- ? twopass->kf_group_bits
- : total_group_bits;
+ total_group_bits = (total_group_bits < 0) ? 0
+ : (total_group_bits > twopass->kf_group_bits)
+ ? twopass->kf_group_bits
+ : total_group_bits;
// Clip based on user supplied data rate variability limit.
if (total_group_bits > (int64_t)max_bits * gop_frames)
@@ -2714,6 +2713,9 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
// frame in which case it will already have been done.
if (is_key_frame == 0) {
vp9_zero(twopass->gf_group);
+ ++rc->gop_global_index;
+ } else {
+ rc->gop_global_index = 0;
}
vpx_clear_system_state();
@@ -2751,6 +2753,37 @@ static void define_gf_group(VP9_COMP *cpi, int gf_start_show_idx) {
}
}
#endif
+ // If the external rate control model for GOP is used, it overrides the GOP
+ // decisions made here; specifically, |gop_coding_frames| and |use_alt_ref|
+ // are overwritten.
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_GOP) != 0) {
+ vpx_codec_err_t codec_status;
+ vpx_rc_gop_decision_t gop_decision;
+ vpx_rc_gop_info_t gop_info;
+ gop_info.min_gf_interval = rc->min_gf_interval;
+ gop_info.max_gf_interval = rc->max_gf_interval;
+ gop_info.active_min_gf_interval = active_gf_interval.min;
+ gop_info.active_max_gf_interval = active_gf_interval.max;
+ gop_info.allow_alt_ref = allow_alt_ref;
+ gop_info.is_key_frame = is_key_frame;
+ gop_info.last_gop_use_alt_ref = rc->source_alt_ref_active;
+ gop_info.frames_since_key = rc->frames_since_key;
+ gop_info.frames_to_key = rc->frames_to_key;
+ gop_info.lag_in_frames = cpi->oxcf.lag_in_frames;
+ gop_info.show_index = cm->current_video_frame;
+ gop_info.coding_index = cm->current_frame_coding_index;
+ gop_info.gop_global_index = rc->gop_global_index;
+
+ codec_status = vp9_extrc_get_gop_decision(&cpi->ext_ratectrl, &gop_info,
+ &gop_decision);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_get_gop_decision() failed");
+ }
+ gop_coding_frames = gop_decision.gop_coding_frames;
+ use_alt_ref = gop_decision.use_alt_ref;
+ }
// Was the group length constrained by the requirement for a new KF?
rc->constrained_gf_group = (gop_coding_frames >= rc->frames_to_key) ? 1 : 0;
@@ -3461,6 +3494,16 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) {
FIRSTPASS_STATS this_frame;
const int show_idx = cm->current_video_frame;
+ if (cpi->common.current_frame_coding_index == 0) {
+ VP9_COMMON *cm = &cpi->common;
+ const vpx_codec_err_t codec_status = vp9_extrc_send_firstpass_stats(
+ &cpi->ext_ratectrl, &cpi->twopass.first_pass_info);
+ if (codec_status != VPX_CODEC_OK) {
+ vpx_internal_error(&cm->error, codec_status,
+ "vp9_extrc_send_firstpass_stats() failed");
+ }
+ }
+
if (!twopass->stats_in) return;
// Configure image size specific vizier parameters
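vp9_rc_get_second_pass_params() now ships the first-pass stats to the external model up front, and define_gf_group() (previous file) hands it a fully populated vpx_rc_gop_info_t. The decision coming back is validated: with use_alt_ref set, gop_coding_frames must lie in [min_gf_interval, lag_in_frames) and alt refs must be allowed; independently, the GOP may not cross the next key frame or exceed max_gf_interval net of the alt-ref frame. A minimal callback that stays inside those bounds (illustrative; VPX_RC_OK assumed):

static vpx_rc_status_t my_get_gop_decision(vpx_rc_model_t model,
                                           const vpx_rc_gop_info_t *info,
                                           vpx_rc_gop_decision_t *decision) {
  int gop = info->active_max_gf_interval;
  (void)model;
  /* Never cross the next key frame. */
  if (gop > info->frames_to_key) gop = info->frames_to_key;
  /* Ask for an alt ref only when the checks above will accept one. */
  decision->use_alt_ref = info->allow_alt_ref &&
                          gop >= info->min_gf_interval &&
                          gop < info->lag_in_frames;
  /* gop_coding_frames - use_alt_ref must not exceed max_gf_interval. */
  if (gop - decision->use_alt_ref > info->max_gf_interval)
    gop = info->max_gf_interval + decision->use_alt_ref;
  decision->gop_coding_frames = gop;
  return VPX_RC_OK;
}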
diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c
index 697c589ab..579b466ca 100644
--- a/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/libvpx/vp9/encoder/vp9_pickmode.c
@@ -268,6 +268,7 @@ static void block_variance(const uint8_t *src, int src_stride,
#endif
uint32_t *sse8x8, int *sum8x8, uint32_t *var8x8) {
int i, j, k = 0;
+ uint32_t k_sqr = 0;
*sse = 0;
*sum = 0;
@@ -305,7 +306,8 @@ static void block_variance(const uint8_t *src, int src_stride,
#endif
*sse += sse8x8[k];
*sum += sum8x8[k];
- var8x8[k] = sse8x8[k] - (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+ k_sqr = (uint32_t)(((int64_t)sum8x8[k] * sum8x8[k]) >> 6);
+ var8x8[k] = sse8x8[k] > k_sqr ? sse8x8[k] - k_sqr : k_sqr - sse8x8[k];
k++;
}
}
@@ -319,6 +321,7 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
int i, j, k = 0;
+ uint32_t k_sqr = 0;
for (i = 0; i < nh; i += 2) {
for (j = 0; j < nw; j += 2) {
@@ -326,9 +329,10 @@ static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
- var_o[k] = sse_o[k] - (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
- (b_width_log2_lookup[unit_size] +
- b_height_log2_lookup[unit_size] + 6));
+ k_sqr = (uint32_t)(((int64_t)sum_o[k] * sum_o[k]) >>
+ (b_width_log2_lookup[unit_size] +
+ b_height_log2_lookup[unit_size] + 6));
+ var_o[k] = sse_o[k] > k_sqr ? sse_o[k] - k_sqr : k_sqr - sse_o[k];
k++;
}
}
@@ -452,6 +456,7 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
unsigned int var8x8[64] = { 0 };
TX_SIZE tx_size;
int i, k;
+ uint32_t sum_sqr;
#if CONFIG_VP9_HIGHBITDEPTH
const vpx_bit_depth_t bd = cpi->common.bit_depth;
#endif
@@ -463,7 +468,8 @@ static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
cpi->common.use_highbitdepth, bd,
#endif
sse8x8, sum8x8, var8x8);
- var = sse - (unsigned int)(((int64_t)sum * sum) >> (bw + bh + 4));
+ sum_sqr = (uint32_t)((int64_t)sum * sum) >> (bw + bh + 4);
+ var = sse > sum_sqr ? sse - sum_sqr : sum_sqr - sse;
*var_y = var;
*sse_y = sse;
@@ -1112,7 +1118,7 @@ static INLINE int rd_less_than_thresh_row_mt(int64_t best_rd, int thresh,
}
static INLINE void update_thresh_freq_fact_row_mt(
- VP9_COMP *cpi, TileDataEnc *tile_data, int source_variance,
+ VP9_COMP *cpi, TileDataEnc *tile_data, unsigned int source_variance,
int thresh_freq_fact_idx, MV_REFERENCE_FRAME ref_frame,
THR_MODES best_mode_idx, PREDICTION_MODE mode) {
THR_MODES thr_mode_idx = mode_idx[ref_frame][mode_offset(mode)];
@@ -1627,9 +1633,9 @@ static int search_new_mv(VP9_COMP *cpi, MACROBLOCK *x,
return -1;
// Exit NEWMV search if base_mv_sse is large.
- if (sf->base_mv_aggressive && base_mv_sse > (best_sse_sofar << scale))
+ if (sf->base_mv_aggressive && (base_mv_sse >> scale) > best_sse_sofar)
return -1;
- if (base_mv_sse < (best_sse_sofar << 1)) {
+ if ((base_mv_sse >> 1) < best_sse_sofar) {
// Base layer mv is good.
// Exit NEWMV search if the base_mv is (0, 0) and sse is low, since
// (0, 0) mode is already tested.
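Several hunks above share one theme: keeping unsigned arithmetic from wrapping. In exact arithmetic sse >= sum^2/n, but the right-shifted estimate can exceed sse by a rounding margin, and a uint32 subtraction then wraps to a huge variance; the fix takes the absolute difference. Likewise, `(base_mv_sse >> scale) > best_sse_sofar` replaces a left shift of the comparand that could wrap. A small demonstration (illustrative):

#include <assert.h>
#include <stdint.h>

int main(void) {
  /* Variance guard: if the rounded mean-square estimate exceeds sse by 1,
     the old 'sse - k_sqr' wrapped to UINT32_MAX instead of staying near 0. */
  const uint32_t sse = 100, k_sqr = 101;
  const uint32_t wrapped = sse - k_sqr;
  const uint32_t fixed = sse > k_sqr ? sse - k_sqr : k_sqr - sse;
  assert(wrapped == UINT32_MAX && fixed == 1);

  /* Shift the left-hand side right instead of the right-hand side left:
     'best_sse_sofar << scale' can wrap to a tiny value and make the old
     comparison spuriously true; 'base_mv_sse >> scale' cannot wrap. */
  {
    const uint32_t base_mv_sse = 1u << 31, best_sse_sofar = 1u << 30;
    const int scale = 2;
    assert(!((base_mv_sse >> scale) > best_sse_sofar));
  }
  return 0;
}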
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index 9058997b0..dcc44449f 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -149,34 +149,6 @@ void vp9_highbd_quantize_fp_32x32_c(
}
#endif
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
- const int16_t *scan, const int16_t *iscan) {
- MACROBLOCKD *const xd = &x->e_mbd;
- struct macroblock_plane *p = &x->plane[plane];
- struct macroblockd_plane *pd = &xd->plane[plane];
- tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block),
- *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- const int n_coeffs = 4 * 4;
-
- if (x->skip_block) {
- memset(qcoeff, 0, n_coeffs * sizeof(*qcoeff));
- memset(dqcoeff, 0, n_coeffs * sizeof(*dqcoeff));
- return;
- }
-
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin,
- p->round, p->quant, p->quant_shift, qcoeff, dqcoeff,
- pd->dequant, &p->eobs[block], scan, iscan);
- return;
- }
-#endif
- vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round,
- p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
- &p->eobs[block], scan, iscan);
-}
-
static void invert_quant(int16_t *quant, int16_t *shift, int d) {
unsigned t;
int l, m;
diff --git a/libvpx/vp9/encoder/vp9_quantize.h b/libvpx/vp9/encoder/vp9_quantize.h
index 2e6d7da2b..f626f0656 100644
--- a/libvpx/vp9/encoder/vp9_quantize.h
+++ b/libvpx/vp9/encoder/vp9_quantize.h
@@ -37,9 +37,6 @@ typedef struct {
DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
} QUANTS;
-void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
- const int16_t *scan, const int16_t *iscan);
-
struct VP9_COMP;
struct VP9Common;
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index 085297391..d9207f7a2 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -327,7 +327,7 @@ static void update_buffer_level_postencode(VP9_COMP *cpi,
rc->buffer_level = rc->bits_off_target;
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
update_layer_buffer_level_postencode(&cpi->svc, encoded_frame_size);
}
}
@@ -910,7 +910,7 @@ static int calc_active_worst_quality_one_pass_vbr(const VP9_COMP *cpi) {
active_worst_quality =
curr_frame == 0 ? rc->worst_quality : rc->last_q[KEY_FRAME] << 1;
} else {
- if (!rc->is_src_frame_alt_ref &&
+ if (!rc->is_src_frame_alt_ref && !cpi->use_svc &&
(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) {
active_worst_quality =
curr_frame == 1
@@ -1871,7 +1871,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
}
}
} else {
- if ((cpi->use_svc && oxcf->rc_mode == VPX_CBR) ||
+ if ((cpi->use_svc) ||
(!rc->is_src_frame_alt_ref &&
!(cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame))) {
rc->last_q[INTER_FRAME] = qindex;
@@ -2021,6 +2021,11 @@ int vp9_calc_pframe_target_size_one_pass_vbr(const VP9_COMP *cpi) {
(rc->baseline_gf_interval + af_ratio - 1)
: ((int64_t)rc->avg_frame_bandwidth * rc->baseline_gf_interval) /
(rc->baseline_gf_interval + af_ratio - 1);
+ // For SVC: refresh flags are used to define the pattern, so we can't use
+ // them to boost the target size here.
+ // TODO(marpan): Consider adding an internal boost on TL0 for VBR-SVC.
+ // For now, just use the CBR logic for setting the target size.
+ if (cpi->use_svc) target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
if (target > INT_MAX) target = INT_MAX;
return vp9_rc_clamp_pframe_target_size(cpi, (int)target);
}
@@ -2147,7 +2152,7 @@ int vp9_calc_pframe_target_size_one_pass_cbr(const VP9_COMP *cpi) {
} else {
target = rc->avg_frame_bandwidth;
}
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
// Note that for layers, avg_frame_bandwidth is the cumulative
// per-frame-bandwidth. For the target size of this frame, use the
// layer average frame size (i.e., non-cumulative per-frame-bw).
@@ -2282,7 +2287,7 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
(svc->spatial_layer_sync[0] == 1 && svc->spatial_layer_id == 0)) {
cm->frame_type = KEY_FRAME;
rc->source_alt_ref_active = 0;
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
if (cm->current_video_frame > 0) vp9_svc_reset_temporal_layers(cpi, 1);
layer = LAYER_IDS_TO_IDX(svc->spatial_layer_id, svc->temporal_layer_id,
svc->number_temporal_layers);
@@ -2290,11 +2295,14 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
// Assumption here is that LAST_FRAME is being updated for a keyframe.
// Thus no change in update flags.
- target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
}
} else {
cm->frame_type = INTER_FRAME;
- if (is_one_pass_cbr_svc(cpi)) {
+ if (is_one_pass_svc(cpi)) {
LAYER_CONTEXT *lc = &svc->layer_context[layer];
// Add condition current_video_frame > 0 for the case where first frame
// is intra only followed by overlay/copy frame. In this case we don't
@@ -2303,7 +2311,23 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
(svc->spatial_layer_id == 0 && cm->current_video_frame > 0)
? 0
: svc->layer_context[svc->temporal_layer_id].is_key_frame;
- target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR) {
+ target = vp9_calc_pframe_target_size_one_pass_cbr(cpi);
+ } else {
+ double rate_err = 0.0;
+ rc->fac_active_worst_inter = 140;
+ rc->fac_active_worst_gf = 100;
+ if (rc->rolling_target_bits > 0) {
+ rate_err =
+ (double)rc->rolling_actual_bits / (double)rc->rolling_target_bits;
+ if (rate_err < 1.0)
+ rc->fac_active_worst_inter = 120;
+ else if (rate_err > 2.0)
+ // Increase active_worst faster if rate fluctuation is high.
+ rc->fac_active_worst_inter = 160;
+ }
+ target = vp9_calc_pframe_target_size_one_pass_vbr(cpi);
+ }
}
}
@@ -2312,7 +2336,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
svc->layer_context[layer].is_key_frame == 1) {
cm->frame_type = KEY_FRAME;
cpi->ref_frame_flags &= (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
- target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
}
// Set the buffer idx and refresh flags for key frames in simulcast mode.
// Note the buffer slot for long-term reference is set below (line 2255),
@@ -2397,7 +2424,10 @@ void vp9_rc_get_svc_params(VP9_COMP *cpi) {
}
if (svc->set_intra_only_frame) {
set_intra_only_frame(cpi);
- target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ if (cpi->oxcf.rc_mode == VPX_CBR)
+ target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
+ else
+ target = vp9_calc_iframe_target_size_one_pass_vbr(cpi);
}
// Overlay frame predicts from LAST (intra-only)
if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG;
@@ -2584,13 +2614,12 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
const uint32_t pic_breadth =
VPXMAX(cpi->common.width, cpi->common.height);
int i;
- for (i = LEVEL_1; i < LEVEL_MAX; ++i) {
+ for (i = 0; i < VP9_LEVELS; ++i) {
if (vp9_level_defs[i].max_luma_picture_size >= pic_size &&
vp9_level_defs[i].max_luma_picture_breadth >= pic_breadth) {
if (rc->min_gf_interval <=
(int)vp9_level_defs[i].min_altref_distance) {
- rc->min_gf_interval =
- (int)vp9_level_defs[i].min_altref_distance + 1;
+ rc->min_gf_interval = (int)vp9_level_defs[i].min_altref_distance;
rc->max_gf_interval =
VPXMAX(rc->max_gf_interval, rc->min_gf_interval);
}
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.h b/libvpx/vp9/encoder/vp9_ratectrl.h
index 83a12cde7..96a8fd3f1 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.h
+++ b/libvpx/vp9/encoder/vp9_ratectrl.h
@@ -211,6 +211,10 @@ typedef struct {
// Flag to constrain golden frame interval on key frame frequency for 1 pass
// VBR.
int constrain_gf_key_freq_onepass_vbr;
+
+ // The index of the current GOP. Start from zero.
+ // When a key frame is inserted, it resets to zero.
+ int gop_global_index;
} RATE_CONTROL;
struct VP9_COMP;
diff --git a/libvpx/vp9/encoder/vp9_rd.c b/libvpx/vp9/encoder/vp9_rd.c
index 9fa3ff186..58dd75b44 100644
--- a/libvpx/vp9/encoder/vp9_rd.c
+++ b/libvpx/vp9/encoder/vp9_rd.c
@@ -244,6 +244,12 @@ int vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
// largest dc_quant is 21387, therefore rdmult should fit in int32_t
int rdmult = q * q;
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+ cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+ return cpi->ext_ratectrl.ext_rdmult;
+ }
+
// Make sure this function is floating point safe.
vpx_clear_system_state();
@@ -287,6 +293,11 @@ static int modulate_rdmult(const VP9_COMP *cpi, int rdmult) {
int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) {
int rdmult = vp9_compute_rd_mult_based_on_qindex(cpi, qindex);
+ if (cpi->ext_ratectrl.ready &&
+ (cpi->ext_ratectrl.funcs.rc_type & VPX_RC_RDMULT) != 0 &&
+ cpi->ext_ratectrl.ext_rdmult != VPX_DEFAULT_RDMULT) {
+ return cpi->ext_ratectrl.ext_rdmult;
+ }
return modulate_rdmult(cpi, rdmult);
}
@@ -567,6 +578,12 @@ void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE],
}
}
+// Disable gcc 12.2 false positive warning.
+// warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=]
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstringop-overflow"
+#endif
void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
const struct macroblockd_plane *pd,
ENTROPY_CONTEXT t_above[16],
@@ -604,6 +621,9 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
break;
}
}
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
void vp9_mv_pred(VP9_COMP *cpi, MACROBLOCK *x, uint8_t *ref_y_buffer,
int ref_y_stride, int ref_frame, BLOCK_SIZE block_size) {
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index 0171a0572..a464ce38f 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -1108,6 +1108,8 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
xd->mi[0]->tx_size = TX_4X4;
+ assert(!x->skip_block);
+
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
@@ -1135,7 +1137,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
uint16_t *const dst16 = CONVERT_TO_SHORTPTR(dst);
int16_t *const src_diff =
vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
- tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, 1, TX_4X4, mode,
x->skip_encode ? src : dst,
@@ -1148,7 +1153,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_highbd_fwht4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0 ? 1 : 0);
@@ -1166,7 +1173,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
vpx_highbd_fdct4x4(src_diff, coeff, 8);
else
vp9_highbd_fht4x4(src_diff, coeff, 8, tx_type);
- vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+ eob, so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
distortion += vp9_highbd_block_error_dispatch(
@@ -1236,7 +1245,10 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
int16_t *const src_diff =
vp9_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
- tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
xd->mi[0]->bmi[block].as_mode = mode;
vp9_predict_intra_block(xd, 1, TX_4X4, mode, x->skip_encode ? src : dst,
x->skip_encode ? src_stride : dst_stride, dst,
@@ -1248,7 +1260,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_fwht4x4(src_diff, coeff, 8);
- vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1263,7 +1277,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
const int coeff_ctx =
combine_entropy_contexts(tempa[idx], templ[idy]);
vp9_fht4x4(src_diff, coeff, 8, tx_type);
- vp9_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
@@ -1640,6 +1656,8 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
const int is_compound = has_second_ref(mi);
const InterpKernel *kernel = vp9_filter_kernels[mi->interp_filter];
+ assert(!x->skip_block);
+
for (ref = 0; ref < 1 + is_compound; ++ref) {
const int bw = b_width_log2_lookup[BLOCK_8X8];
const int h = 4 * (i >> bw);
@@ -1701,18 +1719,27 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, MACROBLOCK *x,
const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
#endif
int64_t ssz, rd, rd1, rd2;
- tran_low_t *coeff;
+ tran_low_t *coeff, *qcoeff, *dqcoeff;
+ uint16_t *eob;
int coeff_ctx;
k += (idy * 2 + idx);
coeff_ctx = combine_entropy_contexts(ta[k & 1], tl[k >> 1]);
coeff = BLOCK_OFFSET(p->coeff, k);
+ qcoeff = BLOCK_OFFSET(p->qcoeff, k);
+ dqcoeff = BLOCK_OFFSET(pd->dqcoeff, k);
+ eob = &p->eobs[k];
+
x->fwd_txfm4x4(vp9_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
coeff, 8);
- vp9_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
#if CONFIG_VP9_HIGHBITDEPTH
+ vpx_highbd_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant,
+ p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+ so->scan, so->iscan);
thisdistortion += vp9_highbd_block_error_dispatch(
coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz, bd);
#else
+ vpx_quantize_b(coeff, 4 * 4, p->zbin, p->round, p->quant, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob, so->scan, so->iscan);
thisdistortion +=
vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k), 16, &ssz);
#endif // CONFIG_VP9_HIGHBITDEPTH
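
The hunks above remove the vp9_regular_quantize_b_4x4() wrapper and call the vpx_dsp quantizers directly, which is why the per-plane zbin/round/quant/quant_shift tables and the qcoeff/dqcoeff/eob outputs now appear explicitly at each call site. For orientation, a b-style quantizer has roughly this shape (a simplified scalar sketch only, with int16_t standing in for tran_low_t; the real vpx_quantize_b() also applies quant_shift in a second multiply step and saturates intermediates):

    #include <stdint.h>

    /* Simplified sketch of a "quantize_b" style quantizer (illustration). */
    static void quantize_b_sketch(const int16_t *coeff, int n_coeffs,
                                  const int16_t *zbin, const int16_t *round,
                                  const int16_t *quant, const int16_t *dequant,
                                  int16_t *qcoeff, int16_t *dqcoeff,
                                  uint16_t *eob, const int16_t *scan) {
      int i;
      *eob = 0;
      for (i = 0; i < n_coeffs; i++) {
        const int rc = scan[i];      /* raster position of the i-th coeff   */
        const int band = (rc != 0);  /* 0 selects DC constants, 1 selects AC */
        const int c = coeff[rc];
        const int abs_c = c < 0 ? -c : c;
        qcoeff[rc] = 0;
        dqcoeff[rc] = 0;
        if (abs_c >= zbin[band]) {   /* dead zone: small values stay zero   */
          const int q = ((abs_c + round[band]) * quant[band]) >> 16;
          qcoeff[rc] = (int16_t)(c < 0 ? -q : q);
          dqcoeff[rc] = (int16_t)(qcoeff[rc] * dequant[band]);
          if (q) *eob = (uint16_t)(i + 1);
        }
      }
    }
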
@@ -3242,6 +3269,7 @@ int vp9_active_h_edge(VP9_COMP *cpi, int mi_row, int mi_step) {
// For two-pass, account for any formatting bars detected.
if (cpi->oxcf.pass == 2) {
TWO_PASS *twopass = &cpi->twopass;
+ vpx_clear_system_state();
// The inactive region is specified in MBs, not mi units.
// The image edge is in the following MB row.
@@ -3269,6 +3297,7 @@ int vp9_active_v_edge(VP9_COMP *cpi, int mi_col, int mi_step) {
// For two-pass, account for any formatting bars detected.
if (cpi->oxcf.pass == 2) {
TWO_PASS *twopass = &cpi->twopass;
+ vpx_clear_system_state();
// The inactive region is specified in MBs, not mi units.
// The image edge is in the following MB row.
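
The vpx_clear_system_state() calls added in these two hunks guard the floating-point math that follows: on 32-bit x86 targets the MMX register state aliases the x87 stack, so assembly kernels can leave the FPU in a state where subsequent double arithmetic misbehaves. The helper is, roughly, the following macro (a hedged sketch of the vpx_ports/system_state.h pattern; the exact configure-time guards differ):

    /* Hedged sketch; the real header keys off the build's arch/asm flags. */
    #if defined(__i386__) || defined(_M_IX86)
    extern void vpx_reset_mmx_state(void);  /* executes EMMS */
    #define vpx_clear_system_state() vpx_reset_mmx_state()
    #else
    #define vpx_clear_system_state()        /* no-op elsewhere */
    #endif
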
@@ -3470,7 +3499,7 @@ void vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, TileDataEnc *tile_data,
}
mode_skip_mask[INTRA_FRAME] |=
- ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
+ (uint16_t) ~(sf->intra_y_mode_mask[max_txsize_lookup[bsize]]);
for (i = 0; i <= LAST_NEW_MV_INDEX; ++i) mode_threshold[i] = 0;
diff --git a/libvpx/vp9/encoder/vp9_segmentation.c b/libvpx/vp9/encoder/vp9_segmentation.c
index a163297e6..d75488a8e 100644
--- a/libvpx/vp9/encoder/vp9_segmentation.c
+++ b/libvpx/vp9/encoder/vp9_segmentation.c
@@ -39,7 +39,7 @@ void vp9_set_segment_data(struct segmentation *seg, signed char *feature_data,
}
void vp9_disable_segfeature(struct segmentation *seg, int segment_id,
SEG_LVL_FEATURES feature_id) {
- seg->feature_mask[segment_id] &= ~(1 << feature_id);
+ seg->feature_mask[segment_id] &= ~(1u << feature_id);
}
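
Both this 1u change and the (uint16_t) cast in the mode_skip_mask hunk above address integer-promotion pitfalls flagged by -Wconversion and the undefined-behavior sanitizers: 1 << n is performed on signed int, so shifting into the sign bit is undefined, and ~mask on a 16-bit value promotes to a 32-bit int whose high bits must then be narrowed deliberately. A minimal illustration:

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      /* 1 << 31 shifts into the sign bit of a signed int: undefined.
         1u << 31 is well defined for every bit of a 32-bit word. */
      const unsigned cleared = ~(1u << 31);          /* 0x7fffffff */
      /* ~x promotes to int; the explicit cast makes the narrowing back
         to 16 bits intentional and silences conversion warnings. */
      const uint16_t mask16 = (uint16_t)~(1u << 3);  /* 0xfff7 */
      printf("%08x %04x\n", cleared, (unsigned)mask16);
      return 0;
    }
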
void vp9_clear_segdata(struct segmentation *seg, int segment_id,
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c
index a57a70ab1..7e9435fb5 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -290,7 +290,7 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
}
static LAYER_CONTEXT *get_layer_context(VP9_COMP *const cpi) {
- if (is_one_pass_cbr_svc(cpi))
+ if (is_one_pass_svc(cpi))
return &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
cpi->svc.number_temporal_layers +
cpi->svc.temporal_layer_id];
@@ -354,7 +354,7 @@ void vp9_restore_layer_context(VP9_COMP *const cpi) {
cpi->alt_ref_source = lc->alt_ref_source;
// Check if it is one-pass SVC mode and lc->speed > 0 (real-time mode
// does not use speed = 0).
- if (is_one_pass_cbr_svc(cpi) && lc->speed > 0) {
+ if (is_one_pass_svc(cpi) && lc->speed > 0) {
cpi->oxcf.speed = lc->speed;
}
cpi->loopfilter_ctrl = lc->loopfilter_ctrl;
@@ -754,7 +754,7 @@ void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) {
svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG);
}
-int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
+int vp9_one_pass_svc_start_layer(VP9_COMP *const cpi) {
int width = 0, height = 0;
SVC *const svc = &cpi->svc;
LAYER_CONTEXT *lc = NULL;
@@ -894,6 +894,10 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
RATE_CONTROL *const lrc = &lc->rc;
lrc->worst_quality = vp9_quantizer_to_qindex(lc->max_q);
lrc->best_quality = vp9_quantizer_to_qindex(lc->min_q);
+ if (cpi->fixed_qp_onepass) {
+ lrc->worst_quality = cpi->rc.worst_quality;
+ lrc->best_quality = cpi->rc.best_quality;
+ }
}
if (cpi->oxcf.resize_mode == RESIZE_DYNAMIC && svc->single_layer_svc == 1 &&
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.h b/libvpx/vp9/encoder/vp9_svc_layercontext.h
index b2d1d1b98..c7328cf57 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.h
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.h
@@ -255,7 +255,7 @@ int vp9_denoise_svc_non_key(struct VP9_COMP *const cpi);
void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi);
-int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
+int vp9_one_pass_svc_start_layer(struct VP9_COMP *const cpi);
void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
diff --git a/libvpx/vp9/encoder/vp9_temporal_filter.c b/libvpx/vp9/encoder/vp9_temporal_filter.c
index 701bb8928..8af30c42a 100644
--- a/libvpx/vp9/encoder/vp9_temporal_filter.c
+++ b/libvpx/vp9/encoder/vp9_temporal_filter.c
@@ -777,16 +777,16 @@ void vp9_temporal_filter_iterate_row_c(VP9_COMP *cpi, ThreadData *td,
// Assign a higher weight to the matching MB if its error
// score is lower. If MC is not applied, the default behavior
// is to weight all MBs equally.
- blk_fw[0] = err < (thresh_low << THR_SHIFT)
- ? 2
- : err < (thresh_high << THR_SHIFT) ? 1 : 0;
+ blk_fw[0] = err < (thresh_low << THR_SHIFT) ? 2
+ : err < (thresh_high << THR_SHIFT) ? 1
+ : 0;
blk_fw[1] = blk_fw[2] = blk_fw[3] = blk_fw[0];
} else {
use_32x32 = 0;
for (k = 0; k < 4; k++)
- blk_fw[k] = blk_bestsme[k] < thresh_low
- ? 2
- : blk_bestsme[k] < thresh_high ? 1 : 0;
+ blk_fw[k] = blk_bestsme[k] < thresh_low ? 2
+ : blk_bestsme[k] < thresh_high ? 1
+ : 0;
}
for (k = 0; k < 4; k++) {
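
The reformatted conditionals above are clang-format's newer layout for a chained ternary; the logic is unchanged. Written out as a helper, the classification is (illustrative only; the function name is not from the source):

    #include <stdint.h>

    /* Three-way filter-weight classification used above (sketch). */
    static int filter_weight(int64_t err, int64_t thresh_low,
                             int64_t thresh_high) {
      if (err < thresh_low) return 2;   /* strong match: full weight */
      if (err < thresh_high) return 1;  /* weaker match: half weight */
      return 0;                         /* poor match: excluded      */
    }
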
diff --git a/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
index 4fa24512c..a7f5117cf 100644
--- a/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
+++ b/libvpx/vp9/encoder/x86/highbd_temporal_filter_sse4.c
@@ -191,13 +191,11 @@ static INLINE void highbd_read_chroma_dist_row_8(
}
static void vp9_highbd_apply_temporal_filter_luma_8(
- const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
- int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
- int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
- int uv_pre_stride, unsigned int block_width, unsigned int block_height,
- int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
- uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
- const uint32_t *v_dist, const uint32_t *const *neighbors_first,
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint32_t *const *neighbors_first,
const uint32_t *const *neighbors_second, int top_weight,
int bottom_weight) {
const int rounding = (1 << strength) >> 1;
@@ -256,17 +254,12 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
y_accum);
- y_src += y_src_stride;
y_pre += y_pre_stride;
y_count += y_pre_stride;
y_accum += y_pre_stride;
y_dist += DIST_STRIDE;
- u_src += uv_src_stride;
- u_pre += uv_pre_stride;
u_dist += DIST_STRIDE;
- v_src += uv_src_stride;
- v_pre += uv_pre_stride;
v_dist += DIST_STRIDE;
// Then all the rows except the last one
@@ -300,11 +293,7 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
&v_first, &v_second);
- u_src += uv_src_stride;
- u_pre += uv_pre_stride;
u_dist += DIST_STRIDE;
- v_src += uv_src_stride;
- v_pre += uv_pre_stride;
v_dist += DIST_STRIDE;
}
@@ -320,7 +309,6 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
y_accum);
- y_src += y_src_stride;
y_pre += y_pre_stride;
y_count += y_pre_stride;
y_accum += y_pre_stride;
@@ -364,13 +352,10 @@ static void vp9_highbd_apply_temporal_filter_luma_8(
// Perform temporal filter for the luma component.
static void vp9_highbd_apply_temporal_filter_luma(
- const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
- int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
- int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
- int uv_pre_stride, unsigned int block_width, unsigned int block_height,
- int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
- uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
- const uint32_t *u_dist, const uint32_t *v_dist) {
+ const uint16_t *y_pre, int y_pre_stride, unsigned int block_width,
+ unsigned int block_height, int ss_x, int ss_y, int strength,
+ const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+ const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
unsigned int blk_col = 0, uv_blk_col = 0;
const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
const unsigned int mid_width = block_width >> 1,
@@ -384,9 +369,7 @@ static void vp9_highbd_apply_temporal_filter_luma(
neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
vp9_highbd_apply_temporal_filter_luma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
- v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
neighbors_first, neighbors_second, top_weight, bottom_weight);
@@ -399,13 +382,10 @@ static void vp9_highbd_apply_temporal_filter_luma(
for (; blk_col < mid_width;
blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
vp9_highbd_apply_temporal_filter_luma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
- u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
- block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
- y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
- v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
- bottom_weight);
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
}
if (!use_whole_blk) {
@@ -417,21 +397,16 @@ static void vp9_highbd_apply_temporal_filter_luma(
for (; blk_col < last_width;
blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
vp9_highbd_apply_temporal_filter_luma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
- u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
- block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
- y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
- v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
- bottom_weight);
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
+ neighbors_first, neighbors_second, top_weight, bottom_weight);
}
// Right
neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
vp9_highbd_apply_temporal_filter_luma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
- v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
+ y_pre + blk_col, y_pre_stride, blk_col_step, block_height, ss_x, ss_y,
strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
neighbors_first, neighbors_second, top_weight, bottom_weight);
@@ -491,13 +466,11 @@ static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
// else use top_weight for top half, and bottom weight for bottom half.
static void vp9_highbd_apply_temporal_filter_chroma_8(
- const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
- int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
- int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
- int uv_pre_stride, unsigned int uv_block_width,
- unsigned int uv_block_height, int ss_x, int ss_y, int strength,
- uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
- const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int uv_block_width, unsigned int uv_block_height, int ss_x,
+ int ss_y, int strength, uint32_t *u_accum, uint16_t *u_count,
+ uint32_t *v_accum, uint16_t *v_count, const uint32_t *y_dist,
+ const uint32_t *u_dist, const uint32_t *v_dist,
const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
int top_weight, int bottom_weight, const int *blk_fw) {
const int rounding = (1 << strength) >> 1;
@@ -565,10 +538,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
v_accum);
- u_src += uv_src_stride;
u_pre += uv_pre_stride;
u_dist += DIST_STRIDE;
- v_src += uv_src_stride;
v_pre += uv_pre_stride;
v_dist += DIST_STRIDE;
u_count += uv_pre_stride;
@@ -576,8 +547,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
v_count += uv_pre_stride;
v_accum += uv_pre_stride;
- y_src += y_src_stride * (1 + ss_y);
- y_pre += y_pre_stride * (1 + ss_y);
y_dist += DIST_STRIDE * (1 + ss_y);
// Then all the rows except the last one
@@ -649,10 +618,8 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
v_accum);
- u_src += uv_src_stride;
u_pre += uv_pre_stride;
u_dist += DIST_STRIDE;
- v_src += uv_src_stride;
v_pre += uv_pre_stride;
v_dist += DIST_STRIDE;
u_count += uv_pre_stride;
@@ -660,8 +627,6 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
v_count += uv_pre_stride;
v_accum += uv_pre_stride;
- y_src += y_src_stride * (1 + ss_y);
- y_pre += y_pre_stride * (1 + ss_y);
y_dist += DIST_STRIDE * (1 + ss_y);
}
@@ -720,12 +685,10 @@ static void vp9_highbd_apply_temporal_filter_chroma_8(
// Perform temporal filter for the chroma components.
static void vp9_highbd_apply_temporal_filter_chroma(
- const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
- int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
- int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
- int uv_pre_stride, unsigned int block_width, unsigned int block_height,
- int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
- uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+ const uint16_t *u_pre, const uint16_t *v_pre, int uv_pre_stride,
+ unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+ int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+ uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
const unsigned int uv_width = block_width >> ss_x,
uv_height = block_height >> ss_y;
@@ -755,8 +718,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
if (use_whole_blk) {
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -764,8 +725,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
} else {
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -789,13 +748,11 @@ static void vp9_highbd_apply_temporal_filter_chroma(
}
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
- v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
- strength, u_accum + uv_blk_col, u_count + uv_blk_col,
- v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
- u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
- top_weight, bottom_weight, NULL);
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
blk_col += blk_col_step;
uv_blk_col += uv_blk_col_step;
@@ -812,8 +769,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
for (; uv_blk_col < uv_mid_width;
blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -830,8 +785,6 @@ static void vp9_highbd_apply_temporal_filter_chroma(
for (; uv_blk_col < uv_last_width;
blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
@@ -849,13 +802,11 @@ static void vp9_highbd_apply_temporal_filter_chroma(
}
vp9_highbd_apply_temporal_filter_chroma_8(
- y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
- u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
- v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
- strength, u_accum + uv_blk_col, u_count + uv_blk_col,
- v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
- u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
- top_weight, bottom_weight, NULL);
+ u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
+ uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
+ u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
+ y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst,
+ neighbors_snd, top_weight, bottom_weight, NULL);
}
void vp9_highbd_apply_temporal_filter_sse4_1(
@@ -929,14 +880,12 @@ void vp9_highbd_apply_temporal_filter_sse4_1(
u_dist_ptr = u_dist + 1;
v_dist_ptr = v_dist + 1;
- vp9_highbd_apply_temporal_filter_luma(
- y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
- u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
- strength, blk_fw, use_whole_blk, y_accum, y_count, y_dist_ptr, u_dist_ptr,
- v_dist_ptr);
+ vp9_highbd_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width,
+ block_height, ss_x, ss_y, strength,
+ blk_fw, use_whole_blk, y_accum, y_count,
+ y_dist_ptr, u_dist_ptr, v_dist_ptr);
vp9_highbd_apply_temporal_filter_chroma(
- y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
strength, blk_fw, use_whole_blk, u_accum, u_count, v_accum, v_count,
y_dist_ptr, u_dist_ptr, v_dist_ptr);
diff --git a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index 2188903b1..e9943447f 100644
--- a/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -111,7 +111,7 @@ static void fadst4_sse2(__m128i *in) {
const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
__m128i in7 = _mm_add_epi16(in[0], in[1]);
@@ -424,7 +424,7 @@ static void fadst8_sse2(__m128i *in) {
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
+ const __m128i k__const_0 = _mm_setzero_si128();
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
@@ -1056,7 +1056,7 @@ static void fadst16_8col(__m128i *in) {
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
u[0] = _mm_unpacklo_epi16(in[15], in[0]);
u[1] = _mm_unpackhi_epi16(in[15], in[0]);
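
These three changes swap _mm_set1_epi16(0) for _mm_setzero_si128(): both produce an all-zero vector, and compilers generally emit the same pxor zeroing idiom for each, but setzero states the intent directly and matches usage elsewhere in the tree. Side by side:

    #include <emmintrin.h>

    /* Equivalent zero vectors; the second spelling states the intent. */
    static __m128i zeros(void) {
      const __m128i broadcast_zero = _mm_set1_epi16(0);
      const __m128i direct_zero = _mm_setzero_si128();
      return _mm_or_si128(broadcast_zero, direct_zero);
    }
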
diff --git a/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
index fcf50eb2a..0e04a2f41 100644
--- a/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ b/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -76,9 +76,9 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
int *num00, const vp9_variance_fn_ptr_t *fn_ptr,
const MV *center_mv) {
const int_mv maxmv = pack_int_mv(x->mv_limits.row_max, x->mv_limits.col_max);
- const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
+ const __m128i v_max_mv_w = _mm_set1_epi32((int)maxmv.as_int);
const int_mv minmv = pack_int_mv(x->mv_limits.row_min, x->mv_limits.col_min);
- const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);
+ const __m128i v_min_mv_w = _mm_set1_epi32((int)minmv.as_int);
const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);
@@ -96,14 +96,14 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
const int_mv fcenter_mv =
pack_int_mv(center_mv->row >> 3, center_mv->col >> 3);
- const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);
+ const __m128i vfcmv = _mm_set1_epi32((int)fcenter_mv.as_int);
const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);
int_mv bmv = pack_int_mv(ref_row, ref_col);
int_mv new_bmv = bmv;
- __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);
+ __m128i v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
const int what_stride = x->plane[0].src.stride;
const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
@@ -300,7 +300,7 @@ int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
bmv = new_bmv;
best_address = new_best_address;
- v_bmv_w = _mm_set1_epi32(bmv.as_int);
+ v_bmv_w = _mm_set1_epi32((int)bmv.as_int);
#if VPX_ARCH_X86_64
v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
diff --git a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 7685e7bc3..bf0e8b121 100644
--- a/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/libvpx/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -754,8 +754,8 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
const int src_h = src->y_crop_height;
const int dst_w = dst->y_crop_width;
const int dst_h = dst->y_crop_height;
- const int dst_uv_w = dst_w / 2;
- const int dst_uv_h = dst_h / 2;
+ const int dst_uv_w = dst->uv_crop_width;
+ const int dst_uv_h = dst->uv_crop_height;
int scaled = 0;
// phase_scaler is usually 0 or 8.
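
dst_w / 2 truncates when the luma dimension is odd, while 4:2:0 chroma planes are sized with a rounded-up half, so the previous code could leave the last chroma row or column unscaled. Taking uv_crop_width/uv_crop_height straight from the destination buffer keeps the two in agreement. The rounding difference (assuming chroma sizing of (w + 1) >> 1, the usual YV12 convention):

    #include <stdio.h>

    int main(void) {
      const int dst_w = 355;                   /* odd luma width (example) */
      const int truncated = dst_w / 2;         /* 177: one column short    */
      const int uv_crop_w = (dst_w + 1) >> 1;  /* 178: matches the buffer  */
      printf("%d %d\n", truncated, uv_crop_w);
      return 0;
    }
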
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
index db18b1a7a..da285be8e 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -18,7 +18,7 @@
#include "vpx_dsp/x86/quantize_sse2.h"
// Zero fill 8 positions in the output buffer.
-static INLINE void store_zero_tran_low(tran_low_t *a) {
+static VPX_FORCE_INLINE void store_zero_tran_low(tran_low_t *a) {
const __m256i zero = _mm256_setzero_si256();
#if CONFIG_VP9_HIGHBITDEPTH
_mm256_storeu_si256((__m256i *)(a), zero);
@@ -28,22 +28,73 @@ static INLINE void store_zero_tran_low(tran_low_t *a) {
#endif
}
-static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr,
- __m256i *coeff256) {
- const __m256i iscan = _mm256_loadu_si256(iscan_ptr);
- const __m256i zero256 = _mm256_setzero_si256();
+static VPX_FORCE_INLINE void load_fp_values_avx2(
+ const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
+ __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
#if CONFIG_VP9_HIGHBITDEPTH
- // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as
- // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly.
- const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8);
- const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256);
+ const __m256i v_iscan = _mm256_permute4x64_epi64(
+ _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
#else
- const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256);
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
#endif
- const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256);
- // Add one to convert from indices to counts
- const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0);
- return _mm256_and_si256(iscan_plus_one, nzero_coeff0);
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob256) {
+ const __m256i eob_lo = eob256;
+ // Copy upper 128 to lower 128
+ const __m256i eob_hi = _mm256_permute2x128_si256(eob256, eob256, 0X81);
+ __m256i eob = _mm256_max_epi16(eob_lo, eob_hi);
+ __m256i eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+ return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize_fp_16(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const int32_t nzflag =
+ _mm256_movemask_epi8(_mm256_cmpgt_epi16(abs_coeff, *thr));
+
+ if (nzflag) {
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i dqcoeff = _mm256_mullo_epi16(qcoeff, *dequant);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ } else {
+ store_zero_tran_low(qcoeff_ptr);
+ store_zero_tran_low(dqcoeff_ptr);
+ }
}
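
get_max_lane_eob() keeps a per-lane running maximum of the inverse-scan indices, masked so zeroed coefficients do not contribute, and get_max_eob() then folds the 256-bit register in half repeatedly (cross-lane copy, then 64-, 32- and 16-bit shuffles) until the overall maximum sits in lane 0. The reduction, in scalar form:

    #include <stdint.h>

    /* Scalar view of get_max_eob(): reduce sixteen 16-bit lanes to the
       single maximum that becomes the end-of-block value. */
    static int16_t max_of_lanes(const int16_t lanes[16]) {
      int16_t m = lanes[0];
      int i;
      for (i = 1; i < 16; i++)
        if (lanes[i] > m) m = lanes[i];
      return m;
    }
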
void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -51,10 +102,114 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- __m128i eob;
- __m256i round256, quant256, dequant256;
- __m256i eob256, thr256;
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_setzero_si256();
+
+ quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += 8 * 2;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_srai_epi16(dequant, 1);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ quantize_fp_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += 8 * 2;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
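
The driver uses libvpx's negative-index idiom: every pointer is advanced to the end of the buffer and n_coeffs is negated, so the loop indexes backwards from the end and terminates on a plain < 0 compare with no extra counter. The same idiom in scalar form (a hypothetical example, not from the source):

    #include <stdint.h>

    static void double_all(const int16_t *in, int16_t *out, intptr_t n) {
      in += n;   /* point one past the end */
      out += n;
      n = -n;    /* n is now negative      */
      while (n < 0) {
        out[n] = (int16_t)(in[n] * 2);  /* in[n] reaches back from the end */
        n += 8;  /* one SIMD-batch step; assumes n is a multiple of 8      */
      }
    }
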
+
+// Enable this flag when matching the optimized code to
+// vp9_quantize_fp_32x32_c(). When disabled, the optimized code matches the
+// existing ssse3 code and quantize_fp_32x32_nz_c().
+//
+// #define MATCH_VP9_QUANTIZE_FP_32X32_C
+
+#ifndef MATCH_VP9_QUANTIZE_FP_32X32_C
+static VPX_FORCE_INLINE void quantize_fp_32x32_16_no_nzflag(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i abs_dqcoeff =
+ _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+ const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ (void)thr;
+}
+#endif
+
+static VPX_FORCE_INLINE void quantize_fp_32x32_16(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob_max) {
+ const __m256i coeff = load_tran_low(coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi16(coeff);
+ const __m256i thr_mask = _mm256_cmpgt_epi16(abs_coeff, *thr);
+ const int32_t nzflag = _mm256_movemask_epi8(thr_mask);
+
+ if (nzflag) {
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(abs_coeff, *round), thr_mask);
+#else
+ const __m256i tmp_rnd = _mm256_adds_epi16(abs_coeff, *round);
+#endif
+ const __m256i abs_qcoeff = _mm256_mulhi_epi16(tmp_rnd, *quant);
+ const __m256i qcoeff = _mm256_sign_epi16(abs_qcoeff, coeff);
+ const __m256i abs_dqcoeff =
+ _mm256_srli_epi16(_mm256_mullo_epi16(abs_qcoeff, *dequant), 1);
+ const __m256i dqcoeff = _mm256_sign_epi16(abs_dqcoeff, coeff);
+ const __m256i nz_mask =
+ _mm256_cmpgt_epi16(abs_qcoeff, _mm256_setzero_si256());
+ store_tran_low(qcoeff, qcoeff_ptr);
+ store_tran_low(dqcoeff, dqcoeff_ptr);
+
+ *eob_max = get_max_lane_eob(iscan_ptr, *eob_max, nz_mask);
+ } else {
+ store_zero_tran_low(qcoeff_ptr);
+ store_zero_tran_low(dqcoeff_ptr);
+ }
+}
+
+void vp9_quantize_fp_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
(void)scan;
coeff_ptr += n_coeffs;
@@ -63,74 +218,223 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff_ptr += n_coeffs;
n_coeffs = -n_coeffs;
+ // Setup global values
+ load_fp_values_avx2(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_srli_epi16(dequant, 2);
+ quant = _mm256_slli_epi16(quant, 1);
{
- __m256i coeff256;
-
- // Setup global values
- {
- const __m128i round = _mm_load_si128((const __m128i *)round_ptr);
- const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
- const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- round256 = _mm256_castsi128_si256(round);
- round256 = _mm256_permute4x64_epi64(round256, 0x54);
-
- quant256 = _mm256_castsi128_si256(quant);
- quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
-
- dequant256 = _mm256_castsi128_si256(dequant);
- dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
- }
-
- {
- __m256i qcoeff256;
- __m256i qtmp256;
- coeff256 = load_tran_low(coeff_ptr + n_coeffs);
- qcoeff256 = _mm256_abs_epi16(coeff256);
- qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
- qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
- qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
- store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
- coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
- store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
- }
-
- eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256);
- n_coeffs += 8 * 2;
+ const __m256i rnd = _mm256_set1_epi16((int16_t)1);
+ round = _mm256_add_epi16(round, rnd);
+ round = _mm256_srai_epi16(round, 1);
}
- // remove dc constants
- dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
- quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
- round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+#ifdef MATCH_VP9_QUANTIZE_FP_32X32_C
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask.
+ thr = _mm256_sub_epi16(thr, _mm256_set1_epi16(1));
+ quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+#else
+ quantize_fp_32x32_16_no_nzflag(
+ &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+ qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+#endif
+
+ n_coeffs += 8 * 2;
- thr256 = _mm256_srai_epi16(dequant256, 1);
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_permute2x128_si256(thr, thr, 0x31);
// AC only loop
while (n_coeffs < 0) {
- __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs);
- __m256i qcoeff256 = _mm256_abs_epi16(coeff256);
- int32_t nzflag =
- _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
-
- if (nzflag) {
- __m256i qtmp256;
- qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
- qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
- qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
- store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
- coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
- store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
- eob256 = _mm256_max_epi16(
- eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256));
- } else {
- store_zero_tran_low(qcoeff_ptr + n_coeffs);
- store_zero_tran_low(dqcoeff_ptr + n_coeffs);
- }
+ quantize_fp_32x32_16(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
n_coeffs += 8 * 2;
}
- eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
- _mm256_extracti128_si256(eob256, 1));
+ *eob_ptr = get_max_eob(eob_max);
+}
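
The 32x32 setup above encodes the quantizer's extra log-scale of one. The target per-lane arithmetic is roughly q = (abs(c) + round/2) * quant >> 15 with dq = q * dequant / 2, but _mm256_mulhi_epi16 fixes the shift at 16; doubling quant turns that >> 16 into an effective >> 15, round is halved with rounding via (round + 1) >> 1, and the dequantized product is shifted right by one. Per-lane sketch (illustrative, ignoring the vector code's saturation):

    #include <stdint.h>

    /* One 16-bit lane of the 32x32 fp quantizer scaling (sketch). */
    static int16_t quant_fp_32x32_lane(int16_t c, int16_t round, int16_t quant,
                                       int16_t dequant, int16_t *dq) {
      const int a = c < 0 ? -c : c;
      const int r = (round + 1) >> 1;  /* halved rounding value */
      /* (a + r) * (2 * quant) >> 16  ==  (a + r) * quant >> 15 */
      const int q = (int)(((int64_t)(a + r) * (quant << 1)) >> 16);
      const int sq = c < 0 ? -q : q;
      *dq = (int16_t)(sq * dequant / 2);  /* dequantization halves too */
      return (int16_t)sq;
    }
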
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
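
_mm256_mul_epi32 only multiplies the even 32-bit lanes (widening each pair to 64 bits), so the helper runs it twice -- once as-is and once with both inputs shifted right by 32 to expose the odd lanes -- shifts each 64-bit product by 16 - log_scale, and re-interleaves the halves with a mask and an OR. Per lane it computes:

    #include <stdint.h>

    /* Scalar equivalent of one lane of mm256_mul_shift_epi32_logscale(). */
    static int32_t mul_shift_lane(int32_t x, int32_t y, int log_scale) {
      return (int32_t)(((int64_t)x * y) >> (16 - log_scale));
    }
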
+
+static VPX_FORCE_INLINE __m256i highbd_init_256(const int16_t *val_ptr) {
+ const __m128i v = _mm_load_si128((const __m128i *)val_ptr);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i dc = _mm_unpacklo_epi16(v, zero);
+ const __m128i ac = _mm_unpackhi_epi16(v, zero);
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void highbd_load_fp_values(
+ const int16_t *round_ptr, __m256i *round, const int16_t *quant_ptr,
+ __m256i *quant, const int16_t *dequant_ptr, __m256i *dequant) {
+ *round = highbd_init_256(round_ptr);
+ *quant = highbd_init_256(quant_ptr);
+ *dequant = highbd_init_256(dequant_ptr);
+}
+
+static VPX_FORCE_INLINE __m256i highbd_get_max_lane_eob(
+ const int16_t *iscan_ptr, __m256i eobmax, __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i tmp_rnd = _mm256_add_epi32(abs_coeff, *round);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, *dequant);
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+ *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int step = 8;
+ __m256i round, quant, dequant;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+
+ highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+
+ n_coeffs += step;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ highbd_quantize_fp(&round, &quant, &dequant, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += step;
+ }
+
+ *eob_ptr = get_max_eob(eob_max);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_fp_32x32(
+ const __m256i *round, const __m256i *quant, const __m256i *dequant,
+ const __m256i *thr, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i thr_mask = _mm256_cmpgt_epi32(abs_coeff, *thr);
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, *round), thr_mask);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp_rnd, quant, 0);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, *dequant), 1);
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, dq);
+
+ *eob = highbd_get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void vp9_highbd_quantize_fp_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const int step = 8;
+ __m256i round, quant, dequant, thr;
+ __m256i eob_max = _mm256_setzero_si256();
+ (void)scan;
+
+ coeff_ptr += n_coeffs;
+ iscan += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ // Setup global values
+ highbd_load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr,
+ &dequant);
+ thr = _mm256_srli_epi32(dequant, 2);
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ thr = _mm256_sub_epi32(thr, _mm256_set1_epi32(1));
+ quant = _mm256_slli_epi32(quant, 1);
+ round = _mm256_srai_epi32(_mm256_add_epi32(round, _mm256_set1_epi32(1)), 1);
+
+ highbd_quantize_fp_32x32(&round, &quant, &dequant, &thr, coeff_ptr + n_coeffs,
+ iscan + n_coeffs, qcoeff_ptr + n_coeffs,
+ dqcoeff_ptr + n_coeffs, &eob_max);
+
+ n_coeffs += step;
+
+ // remove dc constants
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x31);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x31);
+ round = _mm256_permute2x128_si256(round, round, 0x31);
+ thr = _mm256_permute2x128_si256(thr, thr, 0x31);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ highbd_quantize_fp_32x32(
+ &round, &quant, &dequant, &thr, coeff_ptr + n_coeffs, iscan + n_coeffs,
+ qcoeff_ptr + n_coeffs, dqcoeff_ptr + n_coeffs, &eob_max);
+ n_coeffs += step;
+ }
- *eob_ptr = accumulate_eob(eob);
+ *eob_ptr = get_max_eob(eob_max);
}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
index 4bcadaa6a..c87723443 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -16,184 +16,110 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *round_ptr, const int16_t *quant_ptr,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- __m128i zero;
+ const __m128i zero = _mm_setzero_si128();
__m128i thr;
int nzflag;
- __m128i eob;
+ int index = 16;
__m128i round, quant, dequant;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
(void)scan;
- coeff_ptr += n_coeffs;
- iscan += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
-
- {
- __m128i coeff0, coeff1;
-
- // Setup global values
- {
- round = _mm_load_si128((const __m128i *)round_ptr);
- quant = _mm_load_si128((const __m128i *)quant_ptr);
- dequant = _mm_load_si128((const __m128i *)dequant_ptr);
- }
+ // Setup global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- // Do DC and first 15 AC
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- }
+ // Reinsert signs.
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
thr = _mm_srai_epi16(dequant, 1);
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
-
- coeff0 = load_tran_low(coeff_ptr + n_coeffs);
- coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
- if (nzflag) {
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
- store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
- store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
- } else {
- store_zero_tran_low(qcoeff_ptr + n_coeffs);
- store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
-
- store_zero_tran_low(dqcoeff_ptr + n_coeffs);
- store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
- }
- }
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
if (nzflag) {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
+ __m128i eob0;
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
}
- n_coeffs += 8 * 2;
- }
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
+ index += 16;
}
+
+ *eob_ptr = accumulate_eob(eob);
}
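
The rewrite above replaces the open-coded sign handling with the shared invert_sign_sse2() helper. The underlying "poor man's abs" trick: an arithmetic right shift by 15 yields 0 for non-negative values and -1 (all ones) for negative ones, and xor-then-subtract by that mask is a branch-free negate. Applying it once gives the magnitude; applying it again with the saved sign restores the sign. Scalar form:

    #include <stdint.h>

    /* Branch-free abs() as used by the SSE2 path (wraps on INT16_MIN,
       just like the vector code). */
    static int16_t abs16(int16_t x) {
      /* sign is 0 for x >= 0, -1 (all ones) for x < 0 */
      const int16_t sign = (int16_t)(x >> 15);
      /* x ^ 0 - 0 == x;  x ^ -1 - (-1) == ~x + 1 == -x */
      return (int16_t)((x ^ sign) - sign);
    }
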
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
new file mode 100644
index 000000000..d35004e37
--- /dev/null
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
+#include "vpx_dsp/x86/quantize_ssse3.h"
+
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i thr;
+ int nzflag;
+ int index = 16;
+ __m128i round, quant, dequant;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
+
+ (void)scan;
+
+ // Setup global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ thr = _mm_srai_epi16(dequant, 1);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ __m128i eob0;
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
+ }
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
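
The new SSSE3 path drops the sign-mask bookkeeping entirely: _mm_abs_epi16 takes the magnitude and _mm_sign_epi16(a, b) transfers b's sign onto a -- negating a where b is negative, zeroing it where b is zero, and passing it through otherwise -- which is exactly what re-signing a quantized magnitude needs. Per-lane semantics:

    #include <stdint.h>

    /* Scalar semantics of _mm_sign_epi16(a, b) for one lane. */
    static int16_t sign_lane(int16_t a, int16_t b) {
      if (b < 0) return (int16_t)-a;
      if (b == 0) return 0;
      return a;
    }
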
+
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one_s16 = _mm_set1_epi16(1);
+ __m128i thr;
+ int nzflag;
+ int index = 16;
+ __m128i round, quant, dequant;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i eob;
+
+ (void)scan;
+
+  // Set up global values.
+ load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
+  // The 32x32 case halves the round value.
+ round = _mm_add_epi16(round, one_s16);
+ round = _mm_srli_epi16(round, 1);
+
+  // The 16x16 shifts by 16, the 32x32 shifts by 15. We want to use pmulhw, so
+  // upshift quant to account for this.
+ quant = _mm_slli_epi16(quant, 1);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ // Get the abs value of qcoeff again so we can use shifts for division.
+ qcoeff0 = _mm_abs_epi16(qcoeff0);
+ qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ // Divide by 2.
+ qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+ qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr);
+ store_tran_low(qcoeff1, dqcoeff_ptr + 8);
+
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
+
+ thr = _mm_srai_epi16(dequant, 2);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
+
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ // Get the abs value of qcoeff again so we can use shifts for division.
+ qcoeff0 = _mm_abs_epi16(qcoeff0);
+ qcoeff1 = _mm_abs_epi16(qcoeff1);
+
+ qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ // Divide by 2.
+ qcoeff0 = _mm_srli_epi16(qcoeff0, 1);
+ qcoeff1 = _mm_srli_epi16(qcoeff1, 1);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ store_tran_low(qcoeff0, dqcoeff_ptr + index);
+ store_tran_low(qcoeff1, dqcoeff_ptr + index + 8);
+ } else {
+ store_zero_tran_low(qcoeff_ptr + index);
+ store_zero_tran_low(qcoeff_ptr + index + 8);
+
+ store_zero_tran_low(dqcoeff_ptr + index);
+ store_zero_tran_low(dqcoeff_ptr + index + 8);
+ }
+
+ if (nzflag) {
+ const __m128i eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
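
Note: the intrinsics file above replaces the x86_64-only assembly deleted below, so 32-bit SSSE3 builds share this path as well (see the vp9cx.mk hunk later in this change). As a reading aid, here is a minimal scalar sketch of the per-coefficient math the 16x16 kernel vectorizes; it is not part of the patch, saturation and the eob scan are omitted, and element 0 of round/quant/dequant is assumed to hold the DC constant with element 1 holding the AC constant:

#include <stdint.h>
#include <stdlib.h>

static void quantize_fp_scalar_sketch(const int16_t *coeff, int n,
                                      const int16_t *round,
                                      const int16_t *quant,
                                      const int16_t *dequant,
                                      int16_t *qcoeff, int16_t *dqcoeff) {
  int i;
  for (i = 0; i < n; ++i) {
    const int rc = (i == 0) ? 0 : 1;  /* DC constant vs. AC constant */
    const int abs_coeff = abs(coeff[i]);
    /* _mm_adds_epi16 + _mm_mulhi_epi16: (|c| + round) * quant >> 16 */
    int tmp = ((abs_coeff + round[rc]) * quant[rc]) >> 16;
    if (coeff[i] < 0) tmp = -tmp;               /* _mm_sign_epi16 */
    qcoeff[i] = (int16_t)tmp;
    dqcoeff[i] = (int16_t)(tmp * dequant[rc]);  /* _mm_mullo_epi16 */
  }
}

The 32x32 variant changes only the constants: round is halved ((round + 1) >> 1), quant is doubled so that pmulhw's shift by 16 acts like a shift by 15, and the dequantized value is halved before the sign is reapplied.
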
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
deleted file mode 100644
index 680acfec6..000000000
--- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,178 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FP 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \
- qcoeff, dqcoeff, dequant, \
- eob, scan, iscan
-
- ; actual quantize loop - setup pointers, rounders, etc.
- movifnidn coeffq, coeffmp
- movifnidn ncoeffq, ncoeffmp
- movifnidn roundq, roundmp
- movifnidn quantq, quantmp
- mova m1, [roundq] ; m1 = round
- mova m2, [quantq] ; m2 = quant
- mov r2, dequantmp
-%ifidn %1, fp_32x32
- pcmpeqw m5, m5
- psrlw m5, 15
- paddw m1, m5
- psrlw m1, 1 ; m1 = (m1 + 1) / 2
-%endif
- mova m3, [r2q] ; m3 = dequant
- mov r3, qcoeffmp
- mov r4, dqcoeffmp
- mov r5, iscanmp
-%ifidn %1, fp_32x32
- psllw m2, 1
-%endif
- pxor m5, m5 ; m5 = dedicated zero
-
- INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq
- lea r5q, [r5q+ncoeffq*2]
- INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq
- INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq
- neg ncoeffq
-
- ; get DC and first 15 AC coeffs
- LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i]
- LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i]
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpeqw m7, m7
-
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- STORE_TRAN_LOW 8, r3q, ncoeffq, 6, 11, 12
- STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
-%ifidn %1, fp_32x32
- pabsw m8, m8
- pabsw m13, m13
-%endif
- pmullw m8, m3 ; r4[i] = r3[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
- psrlw m8, 1
- psrlw m13, 1
- psignw m8, m9
- psignw m13, m10
- psrlw m0, m3, 2
-%else
- psrlw m0, m3, 1
-%endif
- STORE_TRAN_LOW 8, r4q, ncoeffq, 6, 11, 12
- STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
- pcmpeqw m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m7 ; m11 = scan[i] + 1
- pandn m8, m6 ; m8 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jz .accumulate_eob
-
-.ac_only_loop:
- LOAD_TRAN_LOW 9, coeffq, ncoeffq ; m9 = c[i]
- LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[i]
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
-
- pcmpgtw m7, m6, m0
- pcmpgtw m12, m11, m0
- pmovmskb r6d, m7
- pmovmskb r2d, m12
-
- or r6, r2
- jz .skip_iter
-
- pcmpeqw m7, m7
-
- paddsw m6, m1 ; m6 += round
- paddsw m11, m1 ; m11 += round
- pmulhw m14, m6, m2 ; m14 = m6*q>>16
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- psignw m14, m9 ; m14 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- STORE_TRAN_LOW 14, r3q, ncoeffq, 6, 11, 12
- STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
-%ifidn %1, fp_32x32
- pabsw m14, m14
- pabsw m13, m13
-%endif
- pmullw m14, m3 ; r4[i] = r3[i] * q
- pmullw m13, m3 ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
- psrlw m14, 1
- psrlw m13, 1
- psignw m14, m9
- psignw m13, m10
-%endif
- STORE_TRAN_LOW 14, r4q, ncoeffq, 6, 11, 12
- STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
- pcmpeqw m14, m5 ; m14 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m7 ; m11 = scan[i] + 1
- pandn m14, m6 ; m14 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m14
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jl .ac_only_loop
-
- jmp .accumulate_eob
-.skip_iter:
- STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq
- STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8
- STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq
- STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8
- add ncoeffq, mmsize
- jl .ac_only_loop
-
-.accumulate_eob:
- ; horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- pextrw r6, m8, 0
- mov [r2], r6w
- RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FP fp, 7
-QUANTIZE_FP fp_32x32, 7
diff --git a/libvpx/vp9/ratectrl_rtc.cc b/libvpx/vp9/ratectrl_rtc.cc
index f4d7f7e9e..02e50a857 100644
--- a/libvpx/vp9/ratectrl_rtc.cc
+++ b/libvpx/vp9/ratectrl_rtc.cc
@@ -158,6 +158,8 @@ void VP9RateControlRTC::ComputeQP(const VP9FrameParamsQpRTC &frame_params) {
}
vp9_set_mb_mi(cm, cm->width, cm->height);
cm->frame_type = frame_params.frame_type;
+  // This is needed to ensure the key frame does not get unset in rc_get_svc_params.
+ cpi_->frame_flags = (cm->frame_type == KEY_FRAME) ? FRAMEFLAGS_KEY : 0;
cpi_->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
cpi_->sf.use_nonrd_pick_mode = 1;
if (cpi_->svc.number_spatial_layers == 1 &&
@@ -205,12 +207,16 @@ int VP9RateControlRTC::GetLoopfilterLevel() const {
return lf->filter_level;
}
-signed char *VP9RateControlRTC::GetCyclicRefreshMap() const {
- return cpi_->cyclic_refresh->map;
-}
+bool VP9RateControlRTC::GetSegmentationData(
+ VP9SegmentationData *segmentation_data) const {
+ if (!cpi_->cyclic_refresh->apply_cyclic_refresh) return false;
-int *VP9RateControlRTC::GetDeltaQ() const {
- return cpi_->cyclic_refresh->qindex_delta;
+ segmentation_data->segmentation_map = cpi_->segmentation_map;
+ segmentation_data->segmentation_map_size =
+ cpi_->common.mi_cols * cpi_->common.mi_rows;
+ segmentation_data->delta_q = cpi_->cyclic_refresh->qindex_delta;
+ segmentation_data->delta_q_size = 3u;
+ return true;
}
void VP9RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
diff --git a/libvpx/vp9/ratectrl_rtc.h b/libvpx/vp9/ratectrl_rtc.h
index d2b9417ae..b209e4db6 100644
--- a/libvpx/vp9/ratectrl_rtc.h
+++ b/libvpx/vp9/ratectrl_rtc.h
@@ -58,6 +58,13 @@ struct VP9FrameParamsQpRTC {
int temporal_layer_id;
};
+struct VP9SegmentationData {
+ const uint8_t *segmentation_map;
+ size_t segmentation_map_size;
+ const int *delta_q;
+ size_t delta_q_size;
+};
+
// This interface allows using VP9 real-time rate control without initializing
// the encoder. To use this interface, you need to link with libvpxrc.a.
//
@@ -110,8 +117,7 @@ class VP9RateControlRTC {
// GetQP() needs to be called after ComputeQP() to get the latest QP
int GetQP() const;
int GetLoopfilterLevel() const;
- signed char *GetCyclicRefreshMap() const;
- int *GetDeltaQ() const;
+ bool GetSegmentationData(VP9SegmentationData *segmentation_data) const;
void ComputeQP(const VP9FrameParamsQpRTC &frame_params);
// Feedback to rate control with the size of current encoded frame
void PostEncodeUpdate(uint64_t encoded_frame_size);
diff --git a/libvpx/vp9/simple_encode.cc b/libvpx/vp9/simple_encode.cc
index 654699e1b..f42912d35 100644
--- a/libvpx/vp9/simple_encode.cc
+++ b/libvpx/vp9/simple_encode.cc
@@ -744,10 +744,12 @@ static void UpdateGroupOfPicture(const VP9_COMP *cpi, int start_coding_index,
}
#define SET_STRUCT_VALUE(config, structure, ret, field) \
- if (strcmp(config.name, #field) == 0) { \
- structure->field = atoi(config.value); \
- ret = 1; \
- }
+ do { \
+ if (strcmp(config.name, #field) == 0) { \
+ structure->field = atoi(config.value); \
+ ret = 1; \
+ } \
+ } while (false)
static void UpdateEncodeConfig(const EncodeConfig &config,
VP9EncoderConfig *oxcf) {
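
Note: wrapping SET_STRUCT_VALUE in do { ... } while (false) makes each invocation expand to exactly one statement, so a trailing semicolon nests correctly under if/else. A hypothetical illustration of what the old form broke (use_config and the key_freq field are invented for the example):

/* Old expansion: `if (strcmp(...) == 0) { ... }`; the `;` written after
 * the invocation then becomes an empty statement, leaving the `else`
 * below with no matching `if`, which is a compile error. */
if (use_config)
  SET_STRUCT_VALUE(config, oxcf, ret, key_freq);
else
  ret = 0;
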
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index 05ac9e169..dee175dc0 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -170,8 +170,8 @@ static vpx_codec_err_t update_error_state(
static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
const vpx_codec_enc_cfg_t *cfg,
const struct vp9_extracfg *extra_cfg) {
- RANGE_CHECK(cfg, g_w, 1, 65535); // 16 bits available
- RANGE_CHECK(cfg, g_h, 1, 65535); // 16 bits available
+ RANGE_CHECK(cfg, g_w, 1, 65536); // 16 bits available
+ RANGE_CHECK(cfg, g_h, 1, 65536); // 16 bits available
RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000);
RANGE_CHECK(cfg, g_timebase.num, 1, 1000000000);
RANGE_CHECK_HI(cfg, g_profile, 3);
@@ -1014,6 +1014,7 @@ static vpx_codec_err_t ctrl_set_aq_mode(vpx_codec_alg_priv_t *ctx,
va_list args) {
struct vp9_extracfg extra_cfg = ctx->extra_cfg;
extra_cfg.aq_mode = CAST(VP9E_SET_AQ_MODE, args);
+ if (ctx->cpi->fixed_qp_onepass) extra_cfg.aq_mode = 0;
return update_extra_cfg(ctx, &extra_cfg);
}
@@ -1357,8 +1358,6 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
unsigned int lib_flags = 0;
YV12_BUFFER_CONFIG sd;
int64_t dst_time_stamp = timebase_units_to_ticks(timestamp_ratio, pts);
- int64_t dst_end_time_stamp =
- timebase_units_to_ticks(timestamp_ratio, pts + duration);
size_t size, cx_data_sz;
unsigned char *cx_data;
@@ -1369,6 +1368,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) cpi->b_calculate_psnr = 1;
if (img != NULL) {
+ const int64_t dst_end_time_stamp =
+ timebase_units_to_ticks(timestamp_ratio, pts + duration);
res = image2yuvconfig(img, &sd);
// Store the original flags in to the frame buffer. Will extract the
@@ -1405,6 +1406,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
// compute first pass stats
if (img) {
int ret;
+ int64_t dst_end_time_stamp;
vpx_codec_cx_pkt_t fps_pkt;
ENCODE_FRAME_RESULT encode_frame_result;
vp9_init_encode_frame_result(&encode_frame_result);
@@ -1430,6 +1432,7 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
#endif // !CONFIG_REALTIME_ONLY
} else {
ENCODE_FRAME_RESULT encode_frame_result;
+ int64_t dst_end_time_stamp;
vp9_init_encode_frame_result(&encode_frame_result);
while (cx_data_sz >= ctx->cx_data_sz / 2 &&
-1 != vp9_get_compressed_data(cpi, &lib_flags, &size, cx_data,
@@ -1525,9 +1528,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
cx_data += size;
cx_data_sz -= size;
- if (is_one_pass_cbr_svc(cpi) &&
- (cpi->svc.spatial_layer_id ==
- cpi->svc.number_spatial_layers - 1)) {
+ if (is_one_pass_svc(cpi) && (cpi->svc.spatial_layer_id ==
+ cpi->svc.number_spatial_layers - 1)) {
// Encoded all spatial layers; exit loop.
break;
}
@@ -1950,6 +1952,24 @@ static vpx_codec_err_t ctrl_set_external_rate_control(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_set_quantizer_one_pass(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ VP9_COMP *const cpi = ctx->cpi;
+ const int qp = va_arg(args, int);
+ vpx_codec_enc_cfg_t *cfg = &ctx->cfg;
+ struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+ vpx_codec_err_t res;
+
+ if (qp < 0 || qp > 63) return VPX_CODEC_INVALID_PARAM;
+
+ cfg->rc_min_quantizer = cfg->rc_max_quantizer = qp;
+ extra_cfg.aq_mode = 0;
+ cpi->fixed_qp_onepass = 1;
+
+ res = update_extra_cfg(ctx, &extra_cfg);
+ return res;
+}
+
static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP8_COPY_REFERENCE, ctrl_copy_reference },
@@ -2004,6 +2024,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP9E_SET_DISABLE_LOOPFILTER, ctrl_set_disable_loopfilter },
{ VP9E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
{ VP9E_SET_EXTERNAL_RATE_CONTROL, ctrl_set_external_rate_control },
+ { VP9E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass },
// Getters
{ VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index 3c42c7dfe..bdfe21793 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -334,7 +334,6 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
const uint8_t *data, unsigned int data_sz,
void *user_priv, long deadline) {
const uint8_t *data_start = data;
- const uint8_t *const data_end = data + data_sz;
vpx_codec_err_t res;
uint32_t frame_sizes[8];
int frame_count;
@@ -362,6 +361,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
// Decode in serial mode.
if (frame_count > 0) {
+ const uint8_t *const data_end = data + data_sz;
int i;
for (i = 0; i < frame_count; ++i) {
@@ -379,6 +379,7 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx,
data_start += frame_size;
}
} else {
+ const uint8_t *const data_end = data + data_sz;
while (data_start < data_end) {
const uint32_t frame_size = (uint32_t)(data_end - data_start);
const vpx_codec_err_t res =
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index 92a7fddb9..9072628f2 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -111,8 +111,10 @@ VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_constants.h
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_diamond_search_sad_neon.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_temporal_filter_sse4.c
@@ -121,10 +123,6 @@ endif
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-ifeq ($(VPX_ARCH_X86_64),yes)
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
-endif
-
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h
index a61238cb1..e0b679fbb 100644
--- a/libvpx/vpx/vp8cx.h
+++ b/libvpx/vpx/vp8cx.h
@@ -757,6 +757,16 @@ enum vp8e_enc_control_id {
* Supported in codecs: VP8
*/
VP8E_SET_RTC_EXTERNAL_RATECTRL,
+
+ /*!\brief Codec control to set quantizer for the next frame.
+ *
+ * This will turn off cyclic refresh. Only applicable to 1-pass without
+ * spatial layers.
+ *
+ * Supported in codecs: VP9
+ *
+ */
+ VP9E_SET_QUANTIZER_ONE_PASS,
};
/*!\brief vpx 1-D scaling mode
@@ -1085,6 +1095,8 @@ VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *)
#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS
VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int)
#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL
+VPX_CTRL_USE_TYPE(VP9E_SET_QUANTIZER_ONE_PASS, int)
+#define VPX_CTRL_VP9E_SET_QUANTIZER_ONE_PASS
/*!\endcond */
/*! @} - end defgroup vp8_encoder */
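
Note: a usage sketch for the new control; the codec context and error handling are illustrative, not from the patch. Per ctrl_set_quantizer_one_pass in vp9_cx_iface.c above, values outside [0, 63] are rejected and rc_min_quantizer/rc_max_quantizer are pinned to the given value:

/* Fix the one-pass quantizer at 30 for subsequent frames. */
if (vpx_codec_control(&codec, VP9E_SET_QUANTIZER_ONE_PASS, 30) !=
    VPX_CODEC_OK) {
  /* qp was out of range, or the control is unsupported */
}
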
diff --git a/libvpx/vpx/vpx_encoder.h b/libvpx/vpx/vpx_encoder.h
index 21254bb54..efaf5ef36 100644
--- a/libvpx/vpx/vpx_encoder.h
+++ b/libvpx/vpx/vpx_encoder.h
@@ -115,14 +115,14 @@ typedef int64_t vpx_codec_pts_t;
* support frame types that are codec specific (MPEG-1 D-frames for example)
*/
typedef uint32_t vpx_codec_frame_flags_t;
-#define VPX_FRAME_IS_KEY 0x1 /**< frame is the start of a GOP */
+#define VPX_FRAME_IS_KEY 0x1u /**< frame is the start of a GOP */
/*!\brief frame can be dropped without affecting the stream (no future frame
* depends on this one) */
-#define VPX_FRAME_IS_DROPPABLE 0x2
+#define VPX_FRAME_IS_DROPPABLE 0x2u
/*!\brief frame should be decoded but will not be shown */
-#define VPX_FRAME_IS_INVISIBLE 0x4
+#define VPX_FRAME_IS_INVISIBLE 0x4u
/*!\brief this is a fragment of the encoded frame */
-#define VPX_FRAME_IS_FRAGMENT 0x8
+#define VPX_FRAME_IS_FRAGMENT 0x8u
/*!\brief Error Resilient flags
*
@@ -132,12 +132,13 @@ typedef uint32_t vpx_codec_frame_flags_t;
*/
typedef uint32_t vpx_codec_er_flags_t;
/*!\brief Improve resiliency against losses of whole frames */
-#define VPX_ERROR_RESILIENT_DEFAULT 0x1
+#define VPX_ERROR_RESILIENT_DEFAULT 0x1u
/*!\brief The frame partitions are independently decodable by the bool decoder,
* meaning that partitions can be decoded even though earlier partitions have
* been lost. Note that intra prediction is still done over the partition
- * boundary. */
-#define VPX_ERROR_RESILIENT_PARTITIONS 0x2
+ * boundary.
+ * \note This is only supported by VP8.*/
+#define VPX_ERROR_RESILIENT_PARTITIONS 0x2u
/*!\brief Encoder output packet variants
*
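
Note: the new u suffixes match the declared flag types (vpx_codec_frame_flags_t and vpx_codec_er_flags_t are both uint32_t), keeping mask arithmetic unsigned throughout. For example, when clearing a flag:

vpx_codec_frame_flags_t flags = VPX_FRAME_IS_KEY | VPX_FRAME_IS_DROPPABLE;
/* With a plain 0x2 literal, ~VPX_FRAME_IS_DROPPABLE is the signed int -3,
 * which is then implicitly converted back to uint32_t; with 0x2u the
 * complement and the mask stay unsigned end to end. */
flags &= ~VPX_FRAME_IS_DROPPABLE;
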
diff --git a/libvpx/vpx/vpx_ext_ratectrl.h b/libvpx/vpx/vpx_ext_ratectrl.h
index a193e5595..3c5fc8cfc 100644
--- a/libvpx/vpx/vpx_ext_ratectrl.h
+++ b/libvpx/vpx/vpx_ext_ratectrl.h
@@ -25,7 +25,27 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures.
*/
-#define VPX_EXT_RATECTRL_ABI_VERSION (1)
+#define VPX_EXT_RATECTRL_ABI_VERSION (6)
+
+/*!\brief The control type of the inference API.
+ * In VPX_RC_QP mode, the external rate control model determines the
+ * quantization parameter (QP) for each frame.
+ * In VPX_RC_GOP mode, the external rate control model determines the
+ * group of pictures (GOP) of the video sequence.
+ * In VPX_RC_RDMULT mode, the external rate control model determines the
+ * rate-distortion multiplier (rdmult) for the current frame.
+ * In VPX_RC_GOP_QP mode, the external rate control model determines
+ * both the QP and the GOP.
+ * In VPX_RC_GOP_QP_RDMULT mode, the external rate control model determines
+ * the QP, GOP and the rdmult.
+ */
+typedef enum vpx_rc_type {
+ VPX_RC_QP = 1 << 0,
+ VPX_RC_GOP = 1 << 1,
+ VPX_RC_RDMULT = 1 << 2,
+ VPX_RC_GOP_QP = VPX_RC_QP | VPX_RC_GOP,
+ VPX_RC_GOP_QP_RDMULT = VPX_RC_QP | VPX_RC_GOP | VPX_RC_RDMULT
+} vpx_rc_type_t;
/*!\brief Abstract rate control model handler
*
@@ -34,11 +54,27 @@ extern "C" {
*/
typedef void *vpx_rc_model_t;
+/*!\brief A reserved value for the q index.
+ * If the external rate control model returns this value,
+ * the encoder will use the default q selected by libvpx's rate control
+ * system.
+ */
+#define VPX_DEFAULT_Q -1
+
+/*!\brief A reserved value for the rdmult.
+ * If the external rate control model returns this value,
+ * the encoder will use the default rdmult selected by libvpx's rate control
+ * system.
+ */
+#define VPX_DEFAULT_RDMULT -1
+
/*!\brief Encode frame decision made by the external rate control model
*
* The encoder will receive the decision from the external rate control model
* through get_encodeframe_decision() defined in vpx_rc_funcs_t.
*
+ * If q_index = VPX_DEFAULT_Q, the encoder will use libvpx's default q.
+ *
* If max_frame_size = 0, the encoding ignores max frame size limit.
* If max_frame_size = -1, the encoding uses VP9's max frame size as the limit.
* If the encoded frame size is larger than max_frame_size, the frame is
@@ -67,7 +103,7 @@ typedef struct vpx_rc_encodeframe_info {
int show_index; /**< display index, starts from zero*/
int coding_index; /**< coding index, starts from zero*/
/*!
- * index in group of picture, starts from zero.
+   * index of the current frame in this group of pictures, starts from zero.
*/
int gop_index;
int ref_frame_coding_indexes[3]; /**< three reference frames' coding indices*/
@@ -77,6 +113,14 @@ typedef struct vpx_rc_encodeframe_info {
* 1: Valid
*/
int ref_frame_valid_list[3];
+ /*!
+ * The length of the current GOP.
+ */
+ int gop_size;
+ /*!
+ * Whether the current GOP uses an alt ref.
+ */
+ int use_alt_ref;
} vpx_rc_encodeframe_info_t;
/*!\brief Frame coding result
@@ -258,6 +302,84 @@ typedef struct vpx_rc_config {
int frame_rate_den; /**< denominator of frame rate */
} vpx_rc_config_t;
+/*!\brief Information passed to the external rate control model to
+ * help make GOP decisions.
+ */
+typedef struct vpx_rc_gop_info {
+ /*!
+ * Minimum allowed gf interval, fixed for the whole clip.
+ * Note that it will be modified to match vp9's level constraints
+ * in the encoder.
+ * The level constraint is defined in vp9_encoder.c:
+ * const Vp9LevelSpec vp9_level_defs[VP9_LEVELS].
+ */
+ int min_gf_interval;
+ /*!
+ * Maximum allowed gf interval, fixed for the whole clip.
+ */
+ int max_gf_interval;
+ /*!
+ * Minimum allowed gf interval for the current GOP, determined
+ * by the encoder.
+ */
+ int active_min_gf_interval;
+ /*!
+ * Maximum allowed gf interval for the current GOP, determined
+ * by the encoder.
+ */
+ int active_max_gf_interval;
+ /*!
+ * Whether to allow the use of alt ref, determined by the encoder.
+ * It is fixed for the entire encode.
+ * See function "is_altref_enabled" in vp9_encoder.h.
+ */
+ int allow_alt_ref;
+ /*!
+   * Whether the current frame is a key frame.
+ */
+ int is_key_frame;
+ /*!
+   * Whether the previous GOP uses an alt ref.
+ */
+ int last_gop_use_alt_ref;
+ /*!
+   * Current frame distance to the last keyframe, e.g., if the Nth frame is a
+   * key frame, then the value for frame N + 1 is 1.
+ */
+ int frames_since_key;
+ /*!
+   * Current frame distance to the next keyframe, e.g., if the Nth frame is a
+   * key frame, then the value for frame N - 1 is 1.
+ */
+ int frames_to_key;
+ /*!
+ * Number of lookahead source frames.
+ */
+ int lag_in_frames;
+ /*!
+ * Display index (temporal stamp) of this frame in the whole clip,
+ * starts from zero.
+ */
+ int show_index;
+ /*!
+ * Coding index of this frame in the whole clip, starts from zero.
+ */
+ int coding_index;
+ /*!
+   * The index of the current GOP; starts from zero and resets to zero
+ * when a keyframe is set.
+ */
+ int gop_global_index;
+} vpx_rc_gop_info_t;
+
+/*!\brief The decision made by the external rate control model to set the
+ * group of pictures.
+ */
+typedef struct vpx_rc_gop_decision {
+ int gop_coding_frames; /**< The number of frames of this GOP */
+ int use_alt_ref; /**< Whether to use alt ref for this GOP */
+} vpx_rc_gop_decision_t;
+
/*!\brief Create an external rate control model callback prototype
*
* This callback is invoked by the encoder to create an external rate control
@@ -310,6 +432,32 @@ typedef vpx_rc_status_t (*vpx_rc_update_encodeframe_result_cb_fn_t)(
vpx_rc_model_t rate_ctrl_model,
const vpx_rc_encodeframe_result_t *encode_frame_result);
+/*!\brief Get the GOP structure from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get GOP decisions from
+ * the external rate control model.
+ *
+ * \param[in] rate_ctrl_model rate control model
+ * \param[in] gop_info information collected from the encoder
+ * \param[out] gop_decision GOP decision from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_gop_decision_cb_fn_t)(
+ vpx_rc_model_t rate_ctrl_model, const vpx_rc_gop_info_t *gop_info,
+ vpx_rc_gop_decision_t *gop_decision);
+
+/*!\brief Get the frame rdmult from the external rate control model.
+ *
+ * This callback is invoked by the encoder to get rdmult from
+ * the external rate control model.
+ *
+ * \param[in] rate_ctrl_model rate control model
+ * \param[in] frame_info information collected from the encoder
+ * \param[out] rdmult frame rate-distortion multiplier from the model
+ */
+typedef vpx_rc_status_t (*vpx_rc_get_frame_rdmult_cb_fn_t)(
+ vpx_rc_model_t rate_ctrl_model, const vpx_rc_encodeframe_info_t *frame_info,
+ int *rdmult);
+
/*!\brief Delete the external rate control model callback prototype
*
* This callback is invoked by the encoder to delete the external rate control
@@ -328,6 +476,10 @@ typedef vpx_rc_status_t (*vpx_rc_delete_model_cb_fn_t)(
*/
typedef struct vpx_rc_funcs {
/*!
+ * The rate control type of this API.
+ */
+ vpx_rc_type_t rc_type;
+ /*!
* Create an external rate control model.
*/
vpx_rc_create_model_cb_fn_t create_model;
@@ -344,6 +496,14 @@ typedef struct vpx_rc_funcs {
*/
vpx_rc_update_encodeframe_result_cb_fn_t update_encodeframe_result;
/*!
+ * Get GOP decisions from the external rate control model.
+ */
+ vpx_rc_get_gop_decision_cb_fn_t get_gop_decision;
+ /*!
+ * Get rdmult for the frame from the external rate control model.
+ */
+ vpx_rc_get_frame_rdmult_cb_fn_t get_frame_rdmult;
+ /*!
* Delete the external rate control model.
*/
vpx_rc_delete_model_cb_fn_t delete_model;
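
Note: a sketch of how a caller might register a GOP-only model through the extended struct. The policy in the callback is invented for illustration, the create/update/delete callbacks are elided, VPX_RC_OK is the success status from this header's vpx_rc_status_t, and the control is assumed to take a vpx_rc_funcs_t pointer as declared in vp8cx.h:

static vpx_rc_status_t my_get_gop_decision(vpx_rc_model_t model,
                                           const vpx_rc_gop_info_t *gop_info,
                                           vpx_rc_gop_decision_t *decision) {
  (void)model;
  /* Toy policy: take the encoder's active maximum interval as the GOP
   * length and follow its alt-ref allowance. */
  decision->gop_coding_frames = gop_info->active_max_gf_interval;
  decision->use_alt_ref = gop_info->allow_alt_ref;
  return VPX_RC_OK;
}

/* At setup time: */
vpx_rc_funcs_t funcs = { 0 };
funcs.rc_type = VPX_RC_GOP;
funcs.get_gop_decision = my_get_gop_decision;
vpx_codec_control(&codec, VP9E_SET_EXTERNAL_RATE_CONTROL, &funcs);
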
diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libvpx/vpx_dsp/arm/fdct16x16_neon.c
index 67f43246a..a458ecaa4 100644
--- a/libvpx/vpx_dsp/arm/fdct16x16_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.c
@@ -35,22 +35,23 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp3[16];
// Left half.
- load(input, stride, temp0);
- cross_input(temp0, temp1, 0);
- vpx_fdct16x16_body(temp1, temp0);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp1);
+ vpx_fdct8x16_body(temp1, temp0);
// Right half.
- load(input + 8, stride, temp1);
- cross_input(temp1, temp2, 0);
- vpx_fdct16x16_body(temp2, temp1);
+ load_cross(input + 8, stride, temp1);
+ scale_input(temp1, temp2);
+ vpx_fdct8x16_body(temp2, temp1);
// Transpose top left and top right quarters into one contiguous location to
// process to the top half.
- transpose_8x8(&temp0[0], &temp2[0]);
- transpose_8x8(&temp1[0], &temp2[8]);
+
+ transpose_s16_8x8_new(&temp0[0], &temp2[0]);
+ transpose_s16_8x8_new(&temp1[0], &temp2[8]);
partial_round_shift(temp2);
- cross_input(temp2, temp3, 1);
- vpx_fdct16x16_body(temp3, temp2);
+ cross_input(temp2, temp3);
+ vpx_fdct8x16_body(temp3, temp2);
transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
&temp2[5], &temp2[6], &temp2[7]);
transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -61,12 +62,13 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
// Transpose bottom left and bottom right quarters into one contiguous
// location to process to the bottom half.
- transpose_8x8(&temp0[8], &temp1[0]);
+ transpose_s16_8x8_new(&temp0[8], &temp1[0]);
+
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
&temp1[13], &temp1[14], &temp1[15]);
partial_round_shift(temp1);
- cross_input(temp1, temp0, 1);
- vpx_fdct16x16_body(temp0, temp1);
+ cross_input(temp1, temp0);
+ vpx_fdct8x16_body(temp0, temp1);
transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
&temp1[5], &temp1[6], &temp1[7]);
transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
@@ -74,5 +76,58 @@ void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output, temp1);
store(output + 8, temp1 + 8);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[16];
+ int32x4_t left1[16], left2[16], left3[16], left4[16], right1[16], right2[16],
+ right3[16], right4[16];
+
+ // Left half.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
+
+  // Right half.
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
+
+ // Transpose top left and top right quarters into one contiguous location to
+  // process as the top half.
+
+ transpose_s32_8x8_2(left1, right1, left3, right3);
+ transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+
+ highbd_partial_round_shift(left3, right3);
+ highbd_cross_input(left3, right3, left1, right1);
+ vpx_highbd_fdct8x16_body(left1, right1);
+
+ // Transpose bottom left and bottom right quarters into one contiguous
+  // location to process as the bottom half.
+
+ highbd_partial_round_shift(left4, right4);
+ highbd_cross_input(left4, right4, left2, right2);
+ vpx_highbd_fdct8x16_body(left2, right2);
+
+ transpose_s32_8x8_2(left1, right1, left3, right3);
+ transpose_s32_8x8_2(left2, right2, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, left4, right4);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, left4 + 8, right4 + 8);
+ store16_s32(output, left3);
+ output += 4;
+ store16_s32(output, right3);
+ output += 4;
+
+ store16_s32(output, left4);
+ output += 4;
+ store16_s32(output, right4);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
// __GNUC__ == 4 && __GNUC_MINOR__ == 9 && __GNUC_PATCHLEVEL__ < 4
diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/libvpx/vpx_dsp/arm/fdct16x16_neon.h
index 0dd21153f..43d820b6b 100644
--- a/libvpx/vpx_dsp/arm/fdct16x16_neon.h
+++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.h
@@ -13,6 +13,8 @@
#include <arm_neon.h>
+#include "fdct_neon.h"
+
static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
b[0] = vld1q_s16(a);
a += stride;
@@ -72,45 +74,67 @@ static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
// To maybe reduce register usage this could be combined with the load() step to
// get the first 4 and last 4 values, cross those, then load the middle 8 values
// and cross them.
+static INLINE void scale_input(const int16x8_t *a /*[16]*/,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vshlq_n_s16(a[0], 2);
+ b[1] = vshlq_n_s16(a[1], 2);
+ b[2] = vshlq_n_s16(a[2], 2);
+ b[3] = vshlq_n_s16(a[3], 2);
+ b[4] = vshlq_n_s16(a[4], 2);
+ b[5] = vshlq_n_s16(a[5], 2);
+ b[6] = vshlq_n_s16(a[6], 2);
+ b[7] = vshlq_n_s16(a[7], 2);
+
+ b[8] = vshlq_n_s16(a[8], 2);
+ b[9] = vshlq_n_s16(a[9], 2);
+ b[10] = vshlq_n_s16(a[10], 2);
+ b[11] = vshlq_n_s16(a[11], 2);
+ b[12] = vshlq_n_s16(a[12], 2);
+ b[13] = vshlq_n_s16(a[13], 2);
+ b[14] = vshlq_n_s16(a[14], 2);
+ b[15] = vshlq_n_s16(a[15], 2);
+}
+
static INLINE void cross_input(const int16x8_t *a /*[16]*/,
- int16x8_t *b /*[16]*/, const int pass) {
- if (pass == 0) {
- b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
- b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
- b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
- b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
- b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
- b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
- b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
- b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
-
- b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
- b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
- b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
- b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
- b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
- b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
- b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
- b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
- } else {
- b[0] = vaddq_s16(a[0], a[15]);
- b[1] = vaddq_s16(a[1], a[14]);
- b[2] = vaddq_s16(a[2], a[13]);
- b[3] = vaddq_s16(a[3], a[12]);
- b[4] = vaddq_s16(a[4], a[11]);
- b[5] = vaddq_s16(a[5], a[10]);
- b[6] = vaddq_s16(a[6], a[9]);
- b[7] = vaddq_s16(a[7], a[8]);
-
- b[8] = vsubq_s16(a[7], a[8]);
- b[9] = vsubq_s16(a[6], a[9]);
- b[10] = vsubq_s16(a[5], a[10]);
- b[11] = vsubq_s16(a[4], a[11]);
- b[12] = vsubq_s16(a[3], a[12]);
- b[13] = vsubq_s16(a[2], a[13]);
- b[14] = vsubq_s16(a[1], a[14]);
- b[15] = vsubq_s16(a[0], a[15]);
- }
+ int16x8_t *b /*[16]*/) {
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+}
+
+static INLINE void load_cross(const int16_t *a, int stride,
+ int16x8_t *b /*[16]*/) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+
+ b[8] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 8 * stride));
+ b[9] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 9 * stride));
+ b[10] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 10 * stride));
+ b[11] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 11 * stride));
+ b[12] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 12 * stride));
+ b[13] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 13 * stride));
+ b[14] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 14 * stride));
+ b[15] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 15 * stride));
}
// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
@@ -135,84 +159,9 @@ static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
}
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
- const tran_high_t c, int16x8_t *add,
- int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
- const tran_coef_t c0,
- const tran_coef_t c1, int16x8_t *add,
- int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
- const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
- const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
- const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
- const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
- int16x8_t *b /*[8]*/) {
- // Swap 16 bit elements.
- const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
- const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
- const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
- const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
- // Swap 32 bit elements.
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
- vreinterpretq_s32_s16(c1.val[0]));
- const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
- vreinterpretq_s32_s16(c1.val[1]));
- const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
- vreinterpretq_s32_s16(c3.val[0]));
- const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
- vreinterpretq_s32_s16(c3.val[1]));
-
- // Swap 64 bit elements
- const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
- const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
- const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
- const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
- b[0] = e0.val[0];
- b[1] = e1.val[0];
- b[2] = e2.val[0];
- b[3] = e3.val[0];
- b[4] = e0.val[1];
- b[5] = e1.val[1];
- b[6] = e2.val[1];
- b[7] = e3.val[1];
-}
-
// Main body of fdct16x16.
-static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
- int16x8_t *out /*[16]*/) {
+static void vpx_fdct8x16_body(const int16x8_t *in /*[16]*/,
+ int16x8_t *out /*[16]*/) {
int16x8_t s[8];
int16x8_t x[4];
int16x8_t step[8];
@@ -237,16 +186,17 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
// out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
// out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
- butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
- // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[8]);
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
// out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
- butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[4], &out[12]);
// Stage 2
// Re-using source s5/s6
// s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
// s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
- butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &s[6], &s[5]);
// Stage 3
x[0] = vaddq_s16(s[4], s[5]);
@@ -255,12 +205,12 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
x[3] = vaddq_s16(s[7], s[6]);
// Stage 4
- // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
- // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
- butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
- // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
- // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
- butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[2], &out[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[10], &out[6]);
// step 2
// From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
@@ -272,8 +222,8 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
// step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
// step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
// step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
- butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
- butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+ butterfly_one_coeff_s16_fast(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+ butterfly_one_coeff_s16_fast(in[12], in[11], cospi_16_64, &s[4], &s[3]);
// step 3
s[0] = vaddq_s16(in[8], s[3]);
@@ -286,13 +236,15 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
s[7] = vaddq_s16(in[15], s[4]);
// step 4
- // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
- // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
- butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+  // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * cospi_24_64)
+  // step2[1] =
+  //     fdct_round_shift(step3[6] * cospi_24_64 - step3[1] * cospi_8_64)
+ butterfly_two_coeff(s[6], s[1], cospi_8_64, cospi_24_64, &s[6], &s[1]);
// step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
- // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
- butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+  // step2[5] =
+  //     fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
+ butterfly_two_coeff(x[0], x[3], cospi_24_64, cospi_8_64, &s[2], &s[5]);
// step 5
step[0] = vaddq_s16(s[0], s[1]);
@@ -305,23 +257,368 @@ static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
step[7] = vaddq_s16(s[7], s[6]);
// step 6
- // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
- // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
- // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
- // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
- // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
- // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
- // cospi_22_64)
- // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
- // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
- butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff(step[6], step[1], cospi_18_64, cospi_14_64, &out[9],
&out[7]);
- butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff(step[7], step[0], cospi_2_64, cospi_30_64, &out[1],
&out[15]);
- butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff(step[4], step[3], cospi_26_64, cospi_6_64, &out[13],
&out[3]);
- butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff(step[5], step[2], cospi_10_64, cospi_22_64, &out[5],
&out[11]);
}
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[16]*/,
+ int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[16]*/,
+ int32x4_t *a_right /*[16]*/,
+ int32x4_t *b_left /*[16]*/,
+ int32x4_t *b_right /*[16]*/) {
+ b_left[0] = vaddq_s32(a_left[0], a_left[15]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[14]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[13]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[12]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[11]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[10]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[9]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[8]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[15]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[14]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[13]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[12]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[11]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[10]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[9]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[8]);
+
+ b_left[8] = vsubq_s32(a_left[7], a_left[8]);
+ b_left[9] = vsubq_s32(a_left[6], a_left[9]);
+ b_left[10] = vsubq_s32(a_left[5], a_left[10]);
+ b_left[11] = vsubq_s32(a_left[4], a_left[11]);
+ b_left[12] = vsubq_s32(a_left[3], a_left[12]);
+ b_left[13] = vsubq_s32(a_left[2], a_left[13]);
+ b_left[14] = vsubq_s32(a_left[1], a_left[14]);
+ b_left[15] = vsubq_s32(a_left[0], a_left[15]);
+
+ b_right[8] = vsubq_s32(a_right[7], a_right[8]);
+ b_right[9] = vsubq_s32(a_right[6], a_right[9]);
+ b_right[10] = vsubq_s32(a_right[5], a_right[10]);
+ b_right[11] = vsubq_s32(a_right[4], a_right[11]);
+ b_right[12] = vsubq_s32(a_right[3], a_right[12]);
+ b_right[13] = vsubq_s32(a_right[2], a_right[13]);
+ b_right[14] = vsubq_s32(a_right[1], a_right[14]);
+ b_right[15] = vsubq_s32(a_right[0], a_right[15]);
+}
+
+static INLINE void highbd_partial_round_shift(int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ const int32x4_t one = vdupq_n_s32(1);
+ left[0] = vshrq_n_s32(vaddq_s32(left[0], one), 2);
+ left[1] = vshrq_n_s32(vaddq_s32(left[1], one), 2);
+ left[2] = vshrq_n_s32(vaddq_s32(left[2], one), 2);
+ left[3] = vshrq_n_s32(vaddq_s32(left[3], one), 2);
+ left[4] = vshrq_n_s32(vaddq_s32(left[4], one), 2);
+ left[5] = vshrq_n_s32(vaddq_s32(left[5], one), 2);
+ left[6] = vshrq_n_s32(vaddq_s32(left[6], one), 2);
+ left[7] = vshrq_n_s32(vaddq_s32(left[7], one), 2);
+ left[8] = vshrq_n_s32(vaddq_s32(left[8], one), 2);
+ left[9] = vshrq_n_s32(vaddq_s32(left[9], one), 2);
+ left[10] = vshrq_n_s32(vaddq_s32(left[10], one), 2);
+ left[11] = vshrq_n_s32(vaddq_s32(left[11], one), 2);
+ left[12] = vshrq_n_s32(vaddq_s32(left[12], one), 2);
+ left[13] = vshrq_n_s32(vaddq_s32(left[13], one), 2);
+ left[14] = vshrq_n_s32(vaddq_s32(left[14], one), 2);
+ left[15] = vshrq_n_s32(vaddq_s32(left[15], one), 2);
+
+ right[0] = vshrq_n_s32(vaddq_s32(right[0], one), 2);
+ right[1] = vshrq_n_s32(vaddq_s32(right[1], one), 2);
+ right[2] = vshrq_n_s32(vaddq_s32(right[2], one), 2);
+ right[3] = vshrq_n_s32(vaddq_s32(right[3], one), 2);
+ right[4] = vshrq_n_s32(vaddq_s32(right[4], one), 2);
+ right[5] = vshrq_n_s32(vaddq_s32(right[5], one), 2);
+ right[6] = vshrq_n_s32(vaddq_s32(right[6], one), 2);
+ right[7] = vshrq_n_s32(vaddq_s32(right[7], one), 2);
+ right[8] = vshrq_n_s32(vaddq_s32(right[8], one), 2);
+ right[9] = vshrq_n_s32(vaddq_s32(right[9], one), 2);
+ right[10] = vshrq_n_s32(vaddq_s32(right[10], one), 2);
+ right[11] = vshrq_n_s32(vaddq_s32(right[11], one), 2);
+ right[12] = vshrq_n_s32(vaddq_s32(right[12], one), 2);
+ right[13] = vshrq_n_s32(vaddq_s32(right[13], one), 2);
+ right[14] = vshrq_n_s32(vaddq_s32(right[14], one), 2);
+ right[15] = vshrq_n_s32(vaddq_s32(right[15], one), 2);
+}
+
+// Store 16 int32x4_t vectors, assuming stride == 16.
+static INLINE void store16_s32(tran_low_t *a, const int32x4_t *b /*[16]*/) {
+ vst1q_s32(a, b[0]);
+ a += 16;
+ vst1q_s32(a, b[1]);
+ a += 16;
+ vst1q_s32(a, b[2]);
+ a += 16;
+ vst1q_s32(a, b[3]);
+ a += 16;
+ vst1q_s32(a, b[4]);
+ a += 16;
+ vst1q_s32(a, b[5]);
+ a += 16;
+ vst1q_s32(a, b[6]);
+ a += 16;
+ vst1q_s32(a, b[7]);
+ a += 16;
+ vst1q_s32(a, b[8]);
+ a += 16;
+ vst1q_s32(a, b[9]);
+ a += 16;
+ vst1q_s32(a, b[10]);
+ a += 16;
+ vst1q_s32(a, b[11]);
+ a += 16;
+ vst1q_s32(a, b[12]);
+ a += 16;
+ vst1q_s32(a, b[13]);
+ a += 16;
+ vst1q_s32(a, b[14]);
+ a += 16;
+ vst1q_s32(a, b[15]);
+}
+
+// Main body of the fdct8x16 column transform.
+static void vpx_highbd_fdct8x16_body(int32x4_t *left /*[16]*/,
+ int32x4_t *right /* [16] */) {
+ int32x4_t sl[8];
+ int32x4_t sr[8];
+ int32x4_t xl[4];
+ int32x4_t xr[4];
+ int32x4_t inl[8];
+ int32x4_t inr[8];
+ int32x4_t stepl[8];
+ int32x4_t stepr[8];
+
+ // stage 1
+  // From fwd_txfm.c: "Work on the first eight values; fdct8(input,
+  // even_results);"
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // Copy values 8-15 as we're storing in-place
+ inl[0] = left[8];
+ inr[0] = right[8];
+ inl[1] = left[9];
+ inr[1] = right[9];
+ inl[2] = left[10];
+ inr[2] = right[10];
+ inl[3] = left[11];
+ inr[3] = right[11];
+ inl[4] = left[12];
+ inr[4] = right[12];
+ inl[5] = left[13];
+ inr[5] = right[13];
+ inl[6] = left[14];
+ inr[6] = right[14];
+ inl[7] = left[15];
+ inr[7] = right[15];
+
+ // fdct4(step, step);
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[8], &right[8]);
+
+ // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+ // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[4], &right[4],
+ &left[12], &right[12]);
+
+ // Stage 2
+ // Re-using source s5/s6
+ // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+ // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &sl[6],
+ &sr[6], &sl[5], &sr[5]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], sl[5]);
+ xr[0] = vaddq_s32(sr[4], sr[5]);
+ xl[1] = vsubq_s32(sl[4], sl[5]);
+ xr[1] = vsubq_s32(sr[4], sr[5]);
+ xl[2] = vsubq_s32(sl[7], sl[6]);
+ xr[2] = vsubq_s32(sr[7], sr[6]);
+ xl[3] = vaddq_s32(sl[7], sl[6]);
+ xr[3] = vaddq_s32(sr[7], sr[6]);
+
+ // Stage 4
+ // out[2] = fdct_round_shift(x3 * cospi_4_64 + x0 * cospi_28_64)
+ // out[14] = fdct_round_shift(x3 * cospi_28_64 - x0 * cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[2], &right[2],
+ &left[14], &right[14]);
+ // out[6] = fdct_round_shift(x2 * cospi_20_64 + x1 * cospi_12_64)
+ // out[10] = fdct_round_shift(x2 * cospi_12_64 - x1 * cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[10], &right[10],
+ &left[6], &right[6]);
+
+ // step 2
+  // From fwd_txfm.c: "Work on the next eight values; step1 -> odd_results"
+ // That file distinguished between "in_high" and "step1" but the only
+ // difference is that "in_high" is the first 8 values and "step 1" is the
+ // second. Here, since they are all in one array, "step1" values are += 8.
+
+ // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+ // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+ // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+ // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(inl[5], inr[5], inl[2], inr[2], cospi_16_64,
+ &sl[5], &sr[5], &sl[2], &sr[2]);
+ butterfly_one_coeff_s32_fast(inl[4], inr[4], inl[3], inr[3], cospi_16_64,
+ &sl[4], &sr[4], &sl[3], &sr[3]);
+
+ // step 3
+ sl[0] = vaddq_s32(inl[0], sl[3]);
+ sr[0] = vaddq_s32(inr[0], sr[3]);
+ sl[1] = vaddq_s32(inl[1], sl[2]);
+ sr[1] = vaddq_s32(inr[1], sr[2]);
+ xl[0] = vsubq_s32(inl[1], sl[2]);
+ xr[0] = vsubq_s32(inr[1], sr[2]);
+ xl[1] = vsubq_s32(inl[0], sl[3]);
+ xr[1] = vsubq_s32(inr[0], sr[3]);
+ xl[2] = vsubq_s32(inl[7], sl[4]);
+ xr[2] = vsubq_s32(inr[7], sr[4]);
+ xl[3] = vsubq_s32(inl[6], sl[5]);
+ xr[3] = vsubq_s32(inr[6], sr[5]);
+ sl[6] = vaddq_s32(inl[6], sl[5]);
+ sr[6] = vaddq_s32(inr[6], sr[5]);
+ sl[7] = vaddq_s32(inl[7], sl[4]);
+ sr[7] = vaddq_s32(inr[7], sr[4]);
+
+ // step 4
+ // step2[6] = fdct_round_shift(step3[6] * cospi_8_64 + step3[1] * cospi_24_64)
+ // step2[1] = fdct_round_shift(step3[6] * cospi_24_64 - step3[1] * cospi_8_64)
+ butterfly_two_coeff_s32_s64_narrow(sl[6], sr[6], sl[1], sr[1], cospi_8_64,
+ cospi_24_64, &sl[6], &sr[6], &sl[1],
+ &sr[1]);
+ // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+ // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[0], xr[0], xl[3], xr[3], cospi_24_64,
+ cospi_8_64, &sl[2], &sr[2], &sl[5],
+ &sr[5]);
+
+ // step 5
+ stepl[0] = vaddq_s32(sl[0], sl[1]);
+ stepr[0] = vaddq_s32(sr[0], sr[1]);
+ stepl[1] = vsubq_s32(sl[0], sl[1]);
+ stepr[1] = vsubq_s32(sr[0], sr[1]);
+ stepl[2] = vaddq_s32(xl[1], sl[2]);
+ stepr[2] = vaddq_s32(xr[1], sr[2]);
+ stepl[3] = vsubq_s32(xl[1], sl[2]);
+ stepr[3] = vsubq_s32(xr[1], sr[2]);
+ stepl[4] = vsubq_s32(xl[2], sl[5]);
+ stepr[4] = vsubq_s32(xr[2], sr[5]);
+ stepl[5] = vaddq_s32(xl[2], sl[5]);
+ stepr[5] = vaddq_s32(xr[2], sr[5]);
+ stepl[6] = vsubq_s32(sl[7], sl[6]);
+ stepr[6] = vsubq_s32(sr[7], sr[6]);
+ stepl[7] = vaddq_s32(sl[7], sl[6]);
+ stepr[7] = vaddq_s32(sr[7], sr[6]);
+
+ // step 6
+ // out[9] = fdct_round_shift(step1[6] * cospi_18_64 + step1[1] * cospi_14_64)
+ // out[7] = fdct_round_shift(step1[6] * cospi_14_64 - step1[1] * cospi_18_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[6], stepr[6], stepl[1], stepr[1],
+ cospi_18_64, cospi_14_64, &left[9],
+ &right[9], &left[7], &right[7]);
+ // out[1] = fdct_round_shift(step1[7] * cospi_2_64 + step1[0] * cospi_30_64)
+ // out[15] = fdct_round_shift(step1[7] * cospi_30_64 - step1[0] * cospi_2_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[7], stepr[7], stepl[0], stepr[0],
+ cospi_2_64, cospi_30_64, &left[1],
+ &right[1], &left[15], &right[15]);
+ // out[13] = fdct_round_shift(step1[4] * cospi_26_64 + step1[3] * cospi_6_64)
+ // out[3] = fdct_round_shift(step1[4] * cospi_6_64 - step1[3] * cospi_26_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[4], stepr[4], stepl[3], stepr[3],
+ cospi_26_64, cospi_6_64, &left[13],
+ &right[13], &left[3], &right[3]);
+ // out[5] = fdct_round_shift(step1[5] * cospi_10_64 + step1[2] * cospi_22_64)
+ // out[11] = fdct_round_shift(step1[5] * cospi_22_64 - step1[2] * cospi_10_64)
+ butterfly_two_coeff_s32_s64_narrow(stepl[5], stepr[5], stepl[2], stepr[2],
+ cospi_10_64, cospi_22_64, &left[5],
+ &right[5], &left[11], &right[11]);
+}
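+
+// For reference, a scalar sketch of the butterfly helpers used above, as
+// implied by the fdct_round_shift() formula comments. The Neon versions
+// (presumably in fdct_neon.h, included by the fdct sources) operate on
+// int32x4_t left/right halves:
+//
+//   butterfly_one_coeff_s32_fast(a, b, c):
+//     add = ROUND_POWER_OF_TWO((a + b) * c, DCT_CONST_BITS);
+//     sub = ROUND_POWER_OF_TWO((a - b) * c, DCT_CONST_BITS);
+//   butterfly_two_coeff_s32_s64_narrow(a, b, c0, c1):
+//     add = ROUND_POWER_OF_TWO(a * c0 + b * c1, DCT_CONST_BITS);
+//     sub = ROUND_POWER_OF_TWO(a * c1 - b * c0, DCT_CONST_BITS);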
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libvpx/vpx_dsp/arm/fdct32x32_neon.c
index de74e6630..d6818d2ec 100644
--- a/libvpx/vpx_dsp/arm/fdct32x32_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.c
@@ -15,6 +15,8 @@
#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/fdct32x32_neon.h"
// Most gcc 4.9 distributions outside of Android do not generate correct code
// for this function.
@@ -32,1289 +34,6 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
#else
-#define LOAD_INCREMENT(src, stride, dest, index) \
- do { \
- dest[index] = vld1q_s16(src); \
- src += stride; \
- } while (0)
-
-#define ADD_S16(src, index0, index1, dest, index3) \
- do { \
- dest[index3] = vaddq_s16(src[index0], src[index1]); \
- } while (0)
-
-#define ADD_SHIFT_S16(src, index0, index1) \
- do { \
- src[index1] = vshlq_n_s16(vsubq_s16(src[index0], src[index1]), 2); \
- } while (0)
-
-// Load, cross, and multiply by 4. Load the first 8 and last 8, then the
-// middle 16. Doing sets of 16 at a time. Maybe sets of 8 would be better?
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b) {
- const int16_t *a_end = a + 24 * stride;
- int16x8_t c[8];
-
- LOAD_INCREMENT(a, stride, b, 0);
- LOAD_INCREMENT(a, stride, b, 1);
- LOAD_INCREMENT(a, stride, b, 2);
- LOAD_INCREMENT(a, stride, b, 3);
- LOAD_INCREMENT(a, stride, b, 4);
- LOAD_INCREMENT(a, stride, b, 5);
- LOAD_INCREMENT(a, stride, b, 6);
- LOAD_INCREMENT(a, stride, b, 7);
-
- LOAD_INCREMENT(a_end, stride, b, 24);
- LOAD_INCREMENT(a_end, stride, b, 25);
- LOAD_INCREMENT(a_end, stride, b, 26);
- LOAD_INCREMENT(a_end, stride, b, 27);
- LOAD_INCREMENT(a_end, stride, b, 28);
- LOAD_INCREMENT(a_end, stride, b, 29);
- LOAD_INCREMENT(a_end, stride, b, 30);
- LOAD_INCREMENT(a_end, stride, b, 31);
-
- ADD_S16(b, 0, 31, c, 0);
- ADD_S16(b, 1, 30, c, 1);
- ADD_S16(b, 2, 29, c, 2);
- ADD_S16(b, 3, 28, c, 3);
- ADD_S16(b, 4, 27, c, 4);
- ADD_S16(b, 5, 26, c, 5);
- ADD_S16(b, 6, 25, c, 6);
- ADD_S16(b, 7, 24, c, 7);
-
- ADD_SHIFT_S16(b, 7, 24);
- ADD_SHIFT_S16(b, 6, 25);
- ADD_SHIFT_S16(b, 5, 26);
- ADD_SHIFT_S16(b, 4, 27);
- ADD_SHIFT_S16(b, 3, 28);
- ADD_SHIFT_S16(b, 2, 29);
- ADD_SHIFT_S16(b, 1, 30);
- ADD_SHIFT_S16(b, 0, 31);
-
- b[0] = vshlq_n_s16(c[0], 2);
- b[1] = vshlq_n_s16(c[1], 2);
- b[2] = vshlq_n_s16(c[2], 2);
- b[3] = vshlq_n_s16(c[3], 2);
- b[4] = vshlq_n_s16(c[4], 2);
- b[5] = vshlq_n_s16(c[5], 2);
- b[6] = vshlq_n_s16(c[6], 2);
- b[7] = vshlq_n_s16(c[7], 2);
-
- LOAD_INCREMENT(a, stride, b, 8);
- LOAD_INCREMENT(a, stride, b, 9);
- LOAD_INCREMENT(a, stride, b, 10);
- LOAD_INCREMENT(a, stride, b, 11);
- LOAD_INCREMENT(a, stride, b, 12);
- LOAD_INCREMENT(a, stride, b, 13);
- LOAD_INCREMENT(a, stride, b, 14);
- LOAD_INCREMENT(a, stride, b, 15);
- LOAD_INCREMENT(a, stride, b, 16);
- LOAD_INCREMENT(a, stride, b, 17);
- LOAD_INCREMENT(a, stride, b, 18);
- LOAD_INCREMENT(a, stride, b, 19);
- LOAD_INCREMENT(a, stride, b, 20);
- LOAD_INCREMENT(a, stride, b, 21);
- LOAD_INCREMENT(a, stride, b, 22);
- LOAD_INCREMENT(a, stride, b, 23);
-
- ADD_S16(b, 8, 23, c, 0);
- ADD_S16(b, 9, 22, c, 1);
- ADD_S16(b, 10, 21, c, 2);
- ADD_S16(b, 11, 20, c, 3);
- ADD_S16(b, 12, 19, c, 4);
- ADD_S16(b, 13, 18, c, 5);
- ADD_S16(b, 14, 17, c, 6);
- ADD_S16(b, 15, 16, c, 7);
-
- ADD_SHIFT_S16(b, 15, 16);
- ADD_SHIFT_S16(b, 14, 17);
- ADD_SHIFT_S16(b, 13, 18);
- ADD_SHIFT_S16(b, 12, 19);
- ADD_SHIFT_S16(b, 11, 20);
- ADD_SHIFT_S16(b, 10, 21);
- ADD_SHIFT_S16(b, 9, 22);
- ADD_SHIFT_S16(b, 8, 23);
-
- b[8] = vshlq_n_s16(c[0], 2);
- b[9] = vshlq_n_s16(c[1], 2);
- b[10] = vshlq_n_s16(c[2], 2);
- b[11] = vshlq_n_s16(c[3], 2);
- b[12] = vshlq_n_s16(c[4], 2);
- b[13] = vshlq_n_s16(c[5], 2);
- b[14] = vshlq_n_s16(c[6], 2);
- b[15] = vshlq_n_s16(c[7], 2);
-}
-
-#undef LOAD_INCREMENT
-#undef ADD_S16
-#undef ADD_SHIFT_S16
-
-#define STORE_S16(src, index, dest) \
- do { \
- store_s16q_to_tran_low(dest, src[index]); \
- dest += 8; \
- } while (0)
-
-// Store 32 16x8 values, assuming stride == 32.
-// Slight twist: store horizontally in blocks of 8.
-static INLINE void store(tran_low_t *a, const int16x8_t *b) {
- STORE_S16(b, 0, a);
- STORE_S16(b, 8, a);
- STORE_S16(b, 16, a);
- STORE_S16(b, 24, a);
- STORE_S16(b, 1, a);
- STORE_S16(b, 9, a);
- STORE_S16(b, 17, a);
- STORE_S16(b, 25, a);
- STORE_S16(b, 2, a);
- STORE_S16(b, 10, a);
- STORE_S16(b, 18, a);
- STORE_S16(b, 26, a);
- STORE_S16(b, 3, a);
- STORE_S16(b, 11, a);
- STORE_S16(b, 19, a);
- STORE_S16(b, 27, a);
- STORE_S16(b, 4, a);
- STORE_S16(b, 12, a);
- STORE_S16(b, 20, a);
- STORE_S16(b, 28, a);
- STORE_S16(b, 5, a);
- STORE_S16(b, 13, a);
- STORE_S16(b, 21, a);
- STORE_S16(b, 29, a);
- STORE_S16(b, 6, a);
- STORE_S16(b, 14, a);
- STORE_S16(b, 22, a);
- STORE_S16(b, 30, a);
- STORE_S16(b, 7, a);
- STORE_S16(b, 15, a);
- STORE_S16(b, 23, a);
- STORE_S16(b, 31, a);
-}
-
-#undef STORE_S16
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
- const tran_high_t constant,
- int16x8_t *add, int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
- const tran_coef_t constant0,
- const tran_coef_t constant1,
- int16x8_t *add, int16x8_t *sub) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant0);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant0);
- const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), constant1);
- const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), constant1);
- const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), constant0);
- const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), constant0);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant1);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant1);
- const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
- const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
- const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
- const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
- *add = vcombine_s16(rounded0, rounded1);
- *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Add 2 if positive, 1 if negative, and shift by 2.
-// In practice, subtract the sign bit, then shift with rounding.
-static INLINE int16x8_t sub_round_shift(const int16x8_t a) {
- const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
- const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
- const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
- return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
-}
-
-static void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
-
- // Stage 1: Done as part of the load.
-
- // Stage 2.
- // Mini cross. X the first 16 values and the middle 8 of the second half.
- a[0] = vaddq_s16(in[0], in[15]);
- a[1] = vaddq_s16(in[1], in[14]);
- a[2] = vaddq_s16(in[2], in[13]);
- a[3] = vaddq_s16(in[3], in[12]);
- a[4] = vaddq_s16(in[4], in[11]);
- a[5] = vaddq_s16(in[5], in[10]);
- a[6] = vaddq_s16(in[6], in[9]);
- a[7] = vaddq_s16(in[7], in[8]);
-
- a[8] = vsubq_s16(in[7], in[8]);
- a[9] = vsubq_s16(in[6], in[9]);
- a[10] = vsubq_s16(in[5], in[10]);
- a[11] = vsubq_s16(in[4], in[11]);
- a[12] = vsubq_s16(in[3], in[12]);
- a[13] = vsubq_s16(in[2], in[13]);
- a[14] = vsubq_s16(in[1], in[14]);
- a[15] = vsubq_s16(in[0], in[15]);
-
- a[16] = in[16];
- a[17] = in[17];
- a[18] = in[18];
- a[19] = in[19];
-
- butterfly_one_coeff(in[27], in[20], cospi_16_64, &a[27], &a[20]);
- butterfly_one_coeff(in[26], in[21], cospi_16_64, &a[26], &a[21]);
- butterfly_one_coeff(in[25], in[22], cospi_16_64, &a[25], &a[22]);
- butterfly_one_coeff(in[24], in[23], cospi_16_64, &a[24], &a[23]);
-
- a[28] = in[28];
- a[29] = in[29];
- a[30] = in[30];
- a[31] = in[31];
-
- // Stage 3.
- b[0] = vaddq_s16(a[0], a[7]);
- b[1] = vaddq_s16(a[1], a[6]);
- b[2] = vaddq_s16(a[2], a[5]);
- b[3] = vaddq_s16(a[3], a[4]);
-
- b[4] = vsubq_s16(a[3], a[4]);
- b[5] = vsubq_s16(a[2], a[5]);
- b[6] = vsubq_s16(a[1], a[6]);
- b[7] = vsubq_s16(a[0], a[7]);
-
- b[8] = a[8];
- b[9] = a[9];
-
- butterfly_one_coeff(a[13], a[10], cospi_16_64, &b[13], &b[10]);
- butterfly_one_coeff(a[12], a[11], cospi_16_64, &b[12], &b[11]);
-
- b[14] = a[14];
- b[15] = a[15];
-
- b[16] = vaddq_s16(in[16], a[23]);
- b[17] = vaddq_s16(in[17], a[22]);
- b[18] = vaddq_s16(in[18], a[21]);
- b[19] = vaddq_s16(in[19], a[20]);
-
- b[20] = vsubq_s16(in[19], a[20]);
- b[21] = vsubq_s16(in[18], a[21]);
- b[22] = vsubq_s16(in[17], a[22]);
- b[23] = vsubq_s16(in[16], a[23]);
-
- b[24] = vsubq_s16(in[31], a[24]);
- b[25] = vsubq_s16(in[30], a[25]);
- b[26] = vsubq_s16(in[29], a[26]);
- b[27] = vsubq_s16(in[28], a[27]);
-
- b[28] = vaddq_s16(in[28], a[27]);
- b[29] = vaddq_s16(in[29], a[26]);
- b[30] = vaddq_s16(in[30], a[25]);
- b[31] = vaddq_s16(in[31], a[24]);
-
- // Stage 4.
- a[0] = vaddq_s16(b[0], b[3]);
- a[1] = vaddq_s16(b[1], b[2]);
- a[2] = vsubq_s16(b[1], b[2]);
- a[3] = vsubq_s16(b[0], b[3]);
-
- a[4] = b[4];
-
- butterfly_one_coeff(b[6], b[5], cospi_16_64, &a[6], &a[5]);
-
- a[7] = b[7];
-
- a[8] = vaddq_s16(b[8], b[11]);
- a[9] = vaddq_s16(b[9], b[10]);
- a[10] = vsubq_s16(b[9], b[10]);
- a[11] = vsubq_s16(b[8], b[11]);
- a[12] = vsubq_s16(b[15], b[12]);
- a[13] = vsubq_s16(b[14], b[13]);
- a[14] = vaddq_s16(b[14], b[13]);
- a[15] = vaddq_s16(b[15], b[12]);
-
- a[16] = b[16];
- a[17] = b[17];
-
- butterfly_two_coeff(b[29], b[18], cospi_24_64, cospi_8_64, &a[29], &a[18]);
- butterfly_two_coeff(b[28], b[19], cospi_24_64, cospi_8_64, &a[28], &a[19]);
- butterfly_two_coeff(b[27], b[20], -cospi_8_64, cospi_24_64, &a[27], &a[20]);
- butterfly_two_coeff(b[26], b[21], -cospi_8_64, cospi_24_64, &a[26], &a[21]);
-
- a[22] = b[22];
- a[23] = b[23];
- a[24] = b[24];
- a[25] = b[25];
-
- a[30] = b[30];
- a[31] = b[31];
-
- // Stage 5.
- butterfly_one_coeff(a[0], a[1], cospi_16_64, &b[0], &b[1]);
- butterfly_two_coeff(a[3], a[2], cospi_24_64, cospi_8_64, &b[2], &b[3]);
-
- b[4] = vaddq_s16(a[4], a[5]);
- b[5] = vsubq_s16(a[4], a[5]);
- b[6] = vsubq_s16(a[7], a[6]);
- b[7] = vaddq_s16(a[7], a[6]);
-
- b[8] = a[8];
-
- butterfly_two_coeff(a[14], a[9], cospi_24_64, cospi_8_64, &b[14], &b[9]);
- butterfly_two_coeff(a[13], a[10], -cospi_8_64, cospi_24_64, &b[13], &b[10]);
-
- b[11] = a[11];
- b[12] = a[12];
-
- b[15] = a[15];
-
- b[16] = vaddq_s16(a[19], a[16]);
- b[17] = vaddq_s16(a[18], a[17]);
- b[18] = vsubq_s16(a[17], a[18]);
- b[19] = vsubq_s16(a[16], a[19]);
- b[20] = vsubq_s16(a[23], a[20]);
- b[21] = vsubq_s16(a[22], a[21]);
- b[22] = vaddq_s16(a[21], a[22]);
- b[23] = vaddq_s16(a[20], a[23]);
- b[24] = vaddq_s16(a[27], a[24]);
- b[25] = vaddq_s16(a[26], a[25]);
- b[26] = vsubq_s16(a[25], a[26]);
- b[27] = vsubq_s16(a[24], a[27]);
- b[28] = vsubq_s16(a[31], a[28]);
- b[29] = vsubq_s16(a[30], a[29]);
- b[30] = vaddq_s16(a[29], a[30]);
- b[31] = vaddq_s16(a[28], a[31]);
-
- // Stage 6.
- a[0] = b[0];
- a[1] = b[1];
- a[2] = b[2];
- a[3] = b[3];
-
- butterfly_two_coeff(b[7], b[4], cospi_28_64, cospi_4_64, &a[4], &a[7]);
- butterfly_two_coeff(b[6], b[5], cospi_12_64, cospi_20_64, &a[5], &a[6]);
-
- a[8] = vaddq_s16(b[8], b[9]);
- a[9] = vsubq_s16(b[8], b[9]);
- a[10] = vsubq_s16(b[11], b[10]);
- a[11] = vaddq_s16(b[11], b[10]);
- a[12] = vaddq_s16(b[12], b[13]);
- a[13] = vsubq_s16(b[12], b[13]);
- a[14] = vsubq_s16(b[15], b[14]);
- a[15] = vaddq_s16(b[15], b[14]);
-
- a[16] = b[16];
- a[19] = b[19];
- a[20] = b[20];
- a[23] = b[23];
- a[24] = b[24];
- a[27] = b[27];
- a[28] = b[28];
- a[31] = b[31];
-
- butterfly_two_coeff(b[30], b[17], cospi_28_64, cospi_4_64, &a[30], &a[17]);
- butterfly_two_coeff(b[29], b[18], -cospi_4_64, cospi_28_64, &a[29], &a[18]);
-
- butterfly_two_coeff(b[26], b[21], cospi_12_64, cospi_20_64, &a[26], &a[21]);
- butterfly_two_coeff(b[25], b[22], -cospi_20_64, cospi_12_64, &a[25], &a[22]);
-
- // Stage 7.
- b[0] = a[0];
- b[1] = a[1];
- b[2] = a[2];
- b[3] = a[3];
- b[4] = a[4];
- b[5] = a[5];
- b[6] = a[6];
- b[7] = a[7];
-
- butterfly_two_coeff(a[15], a[8], cospi_30_64, cospi_2_64, &b[8], &b[15]);
- butterfly_two_coeff(a[14], a[9], cospi_14_64, cospi_18_64, &b[9], &b[14]);
- butterfly_two_coeff(a[13], a[10], cospi_22_64, cospi_10_64, &b[10], &b[13]);
- butterfly_two_coeff(a[12], a[11], cospi_6_64, cospi_26_64, &b[11], &b[12]);
-
- b[16] = vaddq_s16(a[16], a[17]);
- b[17] = vsubq_s16(a[16], a[17]);
- b[18] = vsubq_s16(a[19], a[18]);
- b[19] = vaddq_s16(a[19], a[18]);
- b[20] = vaddq_s16(a[20], a[21]);
- b[21] = vsubq_s16(a[20], a[21]);
- b[22] = vsubq_s16(a[23], a[22]);
- b[23] = vaddq_s16(a[23], a[22]);
- b[24] = vaddq_s16(a[24], a[25]);
- b[25] = vsubq_s16(a[24], a[25]);
- b[26] = vsubq_s16(a[27], a[26]);
- b[27] = vaddq_s16(a[27], a[26]);
- b[28] = vaddq_s16(a[28], a[29]);
- b[29] = vsubq_s16(a[28], a[29]);
- b[30] = vsubq_s16(a[31], a[30]);
- b[31] = vaddq_s16(a[31], a[30]);
-
- // Final stage.
- // Also compute partial rounding shift:
- // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- out[0] = sub_round_shift(b[0]);
- out[16] = sub_round_shift(b[1]);
- out[8] = sub_round_shift(b[2]);
- out[24] = sub_round_shift(b[3]);
- out[4] = sub_round_shift(b[4]);
- out[20] = sub_round_shift(b[5]);
- out[12] = sub_round_shift(b[6]);
- out[28] = sub_round_shift(b[7]);
- out[2] = sub_round_shift(b[8]);
- out[18] = sub_round_shift(b[9]);
- out[10] = sub_round_shift(b[10]);
- out[26] = sub_round_shift(b[11]);
- out[6] = sub_round_shift(b[12]);
- out[22] = sub_round_shift(b[13]);
- out[14] = sub_round_shift(b[14]);
- out[30] = sub_round_shift(b[15]);
-
- butterfly_two_coeff(b[31], b[16], cospi_31_64, cospi_1_64, &a[1], &a[31]);
- out[1] = sub_round_shift(a[1]);
- out[31] = sub_round_shift(a[31]);
-
- butterfly_two_coeff(b[30], b[17], cospi_15_64, cospi_17_64, &a[17], &a[15]);
- out[17] = sub_round_shift(a[17]);
- out[15] = sub_round_shift(a[15]);
-
- butterfly_two_coeff(b[29], b[18], cospi_23_64, cospi_9_64, &a[9], &a[23]);
- out[9] = sub_round_shift(a[9]);
- out[23] = sub_round_shift(a[23]);
-
- butterfly_two_coeff(b[28], b[19], cospi_7_64, cospi_25_64, &a[25], &a[7]);
- out[25] = sub_round_shift(a[25]);
- out[7] = sub_round_shift(a[7]);
-
- butterfly_two_coeff(b[27], b[20], cospi_27_64, cospi_5_64, &a[5], &a[27]);
- out[5] = sub_round_shift(a[5]);
- out[27] = sub_round_shift(a[27]);
-
- butterfly_two_coeff(b[26], b[21], cospi_11_64, cospi_21_64, &a[21], &a[11]);
- out[21] = sub_round_shift(a[21]);
- out[11] = sub_round_shift(a[11]);
-
- butterfly_two_coeff(b[25], b[22], cospi_19_64, cospi_13_64, &a[13], &a[19]);
- out[13] = sub_round_shift(a[13]);
- out[19] = sub_round_shift(a[19]);
-
- butterfly_two_coeff(b[24], b[23], cospi_3_64, cospi_29_64, &a[29], &a[3]);
- out[29] = sub_round_shift(a[29]);
- out[3] = sub_round_shift(a[3]);
-}
-
-#define PASS_THROUGH(src, dst, element) \
- do { \
- dst##_lo[element] = src##_lo[element]; \
- dst##_hi[element] = src##_hi[element]; \
- } while (0)
-
-#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = \
- vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
- b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
- vget_high_s16(a[right_index])); \
- } while (0)
-
-#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = \
- vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
- b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
- vget_high_s16(a[right_index])); \
- } while (0)
-
-#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
- do { \
- c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
- c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
- } while (0)
-
-#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
- do { \
- temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
- temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
- c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
- c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
- } while (0)
-
-#define ADD_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
- b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
- } while (0)
-
-#define SUB_S32(a, left_index, right_index, b, b_index) \
- do { \
- b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
- b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
- } while (0)
-
-// Like butterfly_one_coeff, but don't narrow results.
-static INLINE void butterfly_one_coeff_s16_s32(
- const int16x8_t a, const int16x8_t b, const tran_high_t constant,
- int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
- int32x4_t *sub_hi) {
- const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
- const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
- const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
- const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
- const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
- add_index, sub_index) \
- do { \
- butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
- &b##_lo[add_index], &b##_hi[add_index], \
- &b##_lo[sub_index], &b##_hi[sub_index]); \
- } while (0)
-
-// Like butterfly_one_coeff, but with s32.
-static INLINE void butterfly_one_coeff_s32(
- const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
- const int32x4_t b_hi, const int32_t constant, int32x4_t *add_lo,
- int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
- const int32x4_t a_lo_0 = vmulq_n_s32(a_lo, constant);
- const int32x4_t a_hi_0 = vmulq_n_s32(a_hi, constant);
- const int32x4_t sum0 = vmlaq_n_s32(a_lo_0, b_lo, constant);
- const int32x4_t sum1 = vmlaq_n_s32(a_hi_0, b_hi, constant);
- const int32x4_t diff0 = vmlsq_n_s32(a_lo_0, b_lo, constant);
- const int32x4_t diff1 = vmlsq_n_s32(a_hi_0, b_hi, constant);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
- sub_index) \
- do { \
- butterfly_one_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
- a##_lo[right_index], a##_hi[right_index], \
- constant, &b##_lo[add_index], &b##_hi[add_index], \
- &b##_lo[sub_index], &b##_hi[sub_index]); \
- } while (0)
-
-// Like butterfly_two_coeff, but with s32.
-static INLINE void butterfly_two_coeff_s32(
- const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
- const int32x4_t b_hi, const int32_t constant0, const int32_t constant1,
- int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
- int32x4_t *sub_hi) {
- const int32x4_t a0 = vmulq_n_s32(a_lo, constant0);
- const int32x4_t a1 = vmulq_n_s32(a_hi, constant0);
- const int32x4_t a2 = vmulq_n_s32(a_lo, constant1);
- const int32x4_t a3 = vmulq_n_s32(a_hi, constant1);
- const int32x4_t sum0 = vmlaq_n_s32(a2, b_lo, constant0);
- const int32x4_t sum1 = vmlaq_n_s32(a3, b_hi, constant0);
- const int32x4_t diff0 = vmlsq_n_s32(a0, b_lo, constant1);
- const int32x4_t diff1 = vmlsq_n_s32(a1, b_hi, constant1);
- *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
- *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
- *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
- *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
-}
-
-#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
- right_constant, b, add_index, sub_index) \
- do { \
- butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
- a##_lo[right_index], a##_hi[right_index], \
- left_constant, right_constant, &b##_lo[add_index], \
- &b##_hi[add_index], &b##_lo[sub_index], \
- &b##_hi[sub_index]); \
- } while (0)
-
-// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s32(const int32x4_t a_lo,
- const int32x4_t a_hi) {
- const int32x4_t one = vdupq_n_s32(1);
- const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
- const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
- const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
- const int16x4_t b_lo =
- vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
- const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
- const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
- const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
- const int16x4_t b_hi =
- vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
- return vcombine_s16(b_lo, b_hi);
-}
-
-static void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
- int32x4_t c_lo[32];
- int32x4_t c_hi[32];
- int32x4_t d_lo[32];
- int32x4_t d_hi[32];
-
- // Stage 1. Done as part of the load for the first pass.
- a[0] = vaddq_s16(in[0], in[31]);
- a[1] = vaddq_s16(in[1], in[30]);
- a[2] = vaddq_s16(in[2], in[29]);
- a[3] = vaddq_s16(in[3], in[28]);
- a[4] = vaddq_s16(in[4], in[27]);
- a[5] = vaddq_s16(in[5], in[26]);
- a[6] = vaddq_s16(in[6], in[25]);
- a[7] = vaddq_s16(in[7], in[24]);
- a[8] = vaddq_s16(in[8], in[23]);
- a[9] = vaddq_s16(in[9], in[22]);
- a[10] = vaddq_s16(in[10], in[21]);
- a[11] = vaddq_s16(in[11], in[20]);
- a[12] = vaddq_s16(in[12], in[19]);
- a[13] = vaddq_s16(in[13], in[18]);
- a[14] = vaddq_s16(in[14], in[17]);
- a[15] = vaddq_s16(in[15], in[16]);
- a[16] = vsubq_s16(in[15], in[16]);
- a[17] = vsubq_s16(in[14], in[17]);
- a[18] = vsubq_s16(in[13], in[18]);
- a[19] = vsubq_s16(in[12], in[19]);
- a[20] = vsubq_s16(in[11], in[20]);
- a[21] = vsubq_s16(in[10], in[21]);
- a[22] = vsubq_s16(in[9], in[22]);
- a[23] = vsubq_s16(in[8], in[23]);
- a[24] = vsubq_s16(in[7], in[24]);
- a[25] = vsubq_s16(in[6], in[25]);
- a[26] = vsubq_s16(in[5], in[26]);
- a[27] = vsubq_s16(in[4], in[27]);
- a[28] = vsubq_s16(in[3], in[28]);
- a[29] = vsubq_s16(in[2], in[29]);
- a[30] = vsubq_s16(in[1], in[30]);
- a[31] = vsubq_s16(in[0], in[31]);
-
- // Stage 2.
- b[0] = vaddq_s16(a[0], a[15]);
- b[1] = vaddq_s16(a[1], a[14]);
- b[2] = vaddq_s16(a[2], a[13]);
- b[3] = vaddq_s16(a[3], a[12]);
- b[4] = vaddq_s16(a[4], a[11]);
- b[5] = vaddq_s16(a[5], a[10]);
- b[6] = vaddq_s16(a[6], a[9]);
- b[7] = vaddq_s16(a[7], a[8]);
-
- b[8] = vsubq_s16(a[7], a[8]);
- b[9] = vsubq_s16(a[6], a[9]);
- b[10] = vsubq_s16(a[5], a[10]);
- b[11] = vsubq_s16(a[4], a[11]);
- b[12] = vsubq_s16(a[3], a[12]);
- b[13] = vsubq_s16(a[2], a[13]);
- b[14] = vsubq_s16(a[1], a[14]);
- b[15] = vsubq_s16(a[0], a[15]);
-
- b[16] = a[16];
- b[17] = a[17];
- b[18] = a[18];
- b[19] = a[19];
-
- butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
- butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
- butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
- butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
-
- b[28] = a[28];
- b[29] = a[29];
- b[30] = a[30];
- b[31] = a[31];
-
- // Stage 3. With extreme values for input this calculation rolls over int16_t.
- // The sources for b[0] get added multiple times and, through testing, have
- // been shown to overflow starting here.
- ADD_S16_S32(b, 0, 7, c, 0);
- ADD_S16_S32(b, 1, 6, c, 1);
- ADD_S16_S32(b, 2, 5, c, 2);
- ADD_S16_S32(b, 3, 4, c, 3);
- SUB_S16_S32(b, 3, 4, c, 4);
- SUB_S16_S32(b, 2, 5, c, 5);
- SUB_S16_S32(b, 1, 6, c, 6);
- SUB_S16_S32(b, 0, 7, c, 7);
-
- a[8] = b[8];
- a[9] = b[9];
-
- BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
- BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
-
- a[14] = b[14];
- a[15] = b[15];
-
- ADD_S16_S32(b, 16, 23, c, 16);
- ADD_S16_S32(b, 17, 22, c, 17);
- ADD_S16_S32(b, 18, 21, c, 18);
- ADD_S16_S32(b, 19, 20, c, 19);
- SUB_S16_S32(b, 19, 20, c, 20);
- SUB_S16_S32(b, 18, 21, c, 21);
- SUB_S16_S32(b, 17, 22, c, 22);
- SUB_S16_S32(b, 16, 23, c, 23);
- SUB_S16_S32(b, 31, 24, c, 24);
- SUB_S16_S32(b, 30, 25, c, 25);
- SUB_S16_S32(b, 29, 26, c, 26);
- SUB_S16_S32(b, 28, 27, c, 27);
- ADD_S16_S32(b, 28, 27, c, 28);
- ADD_S16_S32(b, 29, 26, c, 29);
- ADD_S16_S32(b, 30, 25, c, 30);
- ADD_S16_S32(b, 31, 24, c, 31);
-
- // Stage 4.
- ADD_S32(c, 0, 3, d, 0);
- ADD_S32(c, 1, 2, d, 1);
- SUB_S32(c, 1, 2, d, 2);
- SUB_S32(c, 0, 3, d, 3);
-
- PASS_THROUGH(c, d, 4);
-
- BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
-
- PASS_THROUGH(c, d, 7);
-
- ADDW_S16_S32(c, 11, a, 8, d, 8);
- ADDW_S16_S32(c, 10, a, 9, d, 9);
- SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
- SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
- SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
- SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
- ADDW_S16_S32(c, 13, b, 14, d, 14);
- ADDW_S16_S32(c, 12, b, 15, d, 15);
-
- PASS_THROUGH(c, d, 16);
- PASS_THROUGH(c, d, 17);
-
- BUTTERFLY_TWO_S32(c, 29, 18, cospi_24_64, cospi_8_64, d, 29, 18);
- BUTTERFLY_TWO_S32(c, 28, 19, cospi_24_64, cospi_8_64, d, 28, 19);
- BUTTERFLY_TWO_S32(c, 27, 20, -cospi_8_64, cospi_24_64, d, 27, 20);
- BUTTERFLY_TWO_S32(c, 26, 21, -cospi_8_64, cospi_24_64, d, 26, 21);
-
- PASS_THROUGH(c, d, 22);
- PASS_THROUGH(c, d, 23);
- PASS_THROUGH(c, d, 24);
- PASS_THROUGH(c, d, 25);
-
- PASS_THROUGH(c, d, 30);
- PASS_THROUGH(c, d, 31);
-
- // Stage 5.
- BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
- BUTTERFLY_TWO_S32(d, 3, 2, cospi_24_64, cospi_8_64, c, 2, 3);
-
- ADD_S32(d, 4, 5, c, 4);
- SUB_S32(d, 4, 5, c, 5);
- SUB_S32(d, 7, 6, c, 6);
- ADD_S32(d, 7, 6, c, 7);
-
- PASS_THROUGH(d, c, 8);
-
- BUTTERFLY_TWO_S32(d, 14, 9, cospi_24_64, cospi_8_64, c, 14, 9);
- BUTTERFLY_TWO_S32(d, 13, 10, -cospi_8_64, cospi_24_64, c, 13, 10);
-
- PASS_THROUGH(d, c, 11);
- PASS_THROUGH(d, c, 12);
- PASS_THROUGH(d, c, 15);
-
- ADD_S32(d, 16, 19, c, 16);
- ADD_S32(d, 17, 18, c, 17);
- SUB_S32(d, 17, 18, c, 18);
- SUB_S32(d, 16, 19, c, 19);
- SUB_S32(d, 23, 20, c, 20);
- SUB_S32(d, 22, 21, c, 21);
- ADD_S32(d, 22, 21, c, 22);
- ADD_S32(d, 23, 20, c, 23);
- ADD_S32(d, 24, 27, c, 24);
- ADD_S32(d, 25, 26, c, 25);
- SUB_S32(d, 25, 26, c, 26);
- SUB_S32(d, 24, 27, c, 27);
- SUB_S32(d, 31, 28, c, 28);
- SUB_S32(d, 30, 29, c, 29);
- ADD_S32(d, 30, 29, c, 30);
- ADD_S32(d, 31, 28, c, 31);
-
- // Stage 6.
- PASS_THROUGH(c, d, 0);
- PASS_THROUGH(c, d, 1);
- PASS_THROUGH(c, d, 2);
- PASS_THROUGH(c, d, 3);
-
- BUTTERFLY_TWO_S32(c, 7, 4, cospi_28_64, cospi_4_64, d, 4, 7);
- BUTTERFLY_TWO_S32(c, 6, 5, cospi_12_64, cospi_20_64, d, 5, 6);
-
- ADD_S32(c, 8, 9, d, 8);
- SUB_S32(c, 8, 9, d, 9);
- SUB_S32(c, 11, 10, d, 10);
- ADD_S32(c, 11, 10, d, 11);
- ADD_S32(c, 12, 13, d, 12);
- SUB_S32(c, 12, 13, d, 13);
- SUB_S32(c, 15, 14, d, 14);
- ADD_S32(c, 15, 14, d, 15);
-
- PASS_THROUGH(c, d, 16);
- PASS_THROUGH(c, d, 19);
- PASS_THROUGH(c, d, 20);
- PASS_THROUGH(c, d, 23);
- PASS_THROUGH(c, d, 24);
- PASS_THROUGH(c, d, 27);
- PASS_THROUGH(c, d, 28);
- PASS_THROUGH(c, d, 31);
-
- BUTTERFLY_TWO_S32(c, 30, 17, cospi_28_64, cospi_4_64, d, 30, 17);
- BUTTERFLY_TWO_S32(c, 29, 18, -cospi_4_64, cospi_28_64, d, 29, 18);
- BUTTERFLY_TWO_S32(c, 26, 21, cospi_12_64, cospi_20_64, d, 26, 21);
- BUTTERFLY_TWO_S32(c, 25, 22, -cospi_20_64, cospi_12_64, d, 25, 22);
-
- // Stage 7.
- PASS_THROUGH(d, c, 0);
- PASS_THROUGH(d, c, 1);
- PASS_THROUGH(d, c, 2);
- PASS_THROUGH(d, c, 3);
- PASS_THROUGH(d, c, 4);
- PASS_THROUGH(d, c, 5);
- PASS_THROUGH(d, c, 6);
- PASS_THROUGH(d, c, 7);
-
- BUTTERFLY_TWO_S32(d, 15, 8, cospi_30_64, cospi_2_64, c, 8, 15);
- BUTTERFLY_TWO_S32(d, 14, 9, cospi_14_64, cospi_18_64, c, 9, 14);
- BUTTERFLY_TWO_S32(d, 13, 10, cospi_22_64, cospi_10_64, c, 10, 13);
- BUTTERFLY_TWO_S32(d, 12, 11, cospi_6_64, cospi_26_64, c, 11, 12);
-
- ADD_S32(d, 16, 17, c, 16);
- SUB_S32(d, 16, 17, c, 17);
- SUB_S32(d, 19, 18, c, 18);
- ADD_S32(d, 19, 18, c, 19);
- ADD_S32(d, 20, 21, c, 20);
- SUB_S32(d, 20, 21, c, 21);
- SUB_S32(d, 23, 22, c, 22);
- ADD_S32(d, 23, 22, c, 23);
- ADD_S32(d, 24, 25, c, 24);
- SUB_S32(d, 24, 25, c, 25);
- SUB_S32(d, 27, 26, c, 26);
- ADD_S32(d, 27, 26, c, 27);
- ADD_S32(d, 28, 29, c, 28);
- SUB_S32(d, 28, 29, c, 29);
- SUB_S32(d, 31, 30, c, 30);
- ADD_S32(d, 31, 30, c, 31);
-
- // Final stage.
- // Roll rounding into this function so we can pass back int16x8.
-
- out[0] = add_round_shift_s32(c_lo[0], c_hi[0]);
- out[16] = add_round_shift_s32(c_lo[1], c_hi[1]);
-
- out[8] = add_round_shift_s32(c_lo[2], c_hi[2]);
- out[24] = add_round_shift_s32(c_lo[3], c_hi[3]);
- out[4] = add_round_shift_s32(c_lo[4], c_hi[4]);
- out[20] = add_round_shift_s32(c_lo[5], c_hi[5]);
- out[12] = add_round_shift_s32(c_lo[6], c_hi[6]);
-
- out[28] = add_round_shift_s32(c_lo[7], c_hi[7]);
- out[2] = add_round_shift_s32(c_lo[8], c_hi[8]);
- out[18] = add_round_shift_s32(c_lo[9], c_hi[9]);
- out[10] = add_round_shift_s32(c_lo[10], c_hi[10]);
-
- out[26] = add_round_shift_s32(c_lo[11], c_hi[11]);
- out[6] = add_round_shift_s32(c_lo[12], c_hi[12]);
- out[22] = add_round_shift_s32(c_lo[13], c_hi[13]);
- out[14] = add_round_shift_s32(c_lo[14], c_hi[14]);
- out[30] = add_round_shift_s32(c_lo[15], c_hi[15]);
-
- BUTTERFLY_TWO_S32(c, 31, 16, cospi_31_64, cospi_1_64, d, 1, 31);
- out[1] = add_round_shift_s32(d_lo[1], d_hi[1]);
- out[31] = add_round_shift_s32(d_lo[31], d_hi[31]);
-
- BUTTERFLY_TWO_S32(c, 30, 17, cospi_15_64, cospi_17_64, d, 17, 15);
- out[17] = add_round_shift_s32(d_lo[17], d_hi[17]);
- out[15] = add_round_shift_s32(d_lo[15], d_hi[15]);
-
- BUTTERFLY_TWO_S32(c, 29, 18, cospi_23_64, cospi_9_64, d, 9, 23);
- out[9] = add_round_shift_s32(d_lo[9], d_hi[9]);
- out[23] = add_round_shift_s32(d_lo[23], d_hi[23]);
-
- BUTTERFLY_TWO_S32(c, 28, 19, cospi_7_64, cospi_25_64, d, 25, 7);
- out[25] = add_round_shift_s32(d_lo[25], d_hi[25]);
- out[7] = add_round_shift_s32(d_lo[7], d_hi[7]);
-
- BUTTERFLY_TWO_S32(c, 27, 20, cospi_27_64, cospi_5_64, d, 5, 27);
- out[5] = add_round_shift_s32(d_lo[5], d_hi[5]);
- out[27] = add_round_shift_s32(d_lo[27], d_hi[27]);
-
- BUTTERFLY_TWO_S32(c, 26, 21, cospi_11_64, cospi_21_64, d, 21, 11);
- out[21] = add_round_shift_s32(d_lo[21], d_hi[21]);
- out[11] = add_round_shift_s32(d_lo[11], d_hi[11]);
-
- BUTTERFLY_TWO_S32(c, 25, 22, cospi_19_64, cospi_13_64, d, 13, 19);
- out[13] = add_round_shift_s32(d_lo[13], d_hi[13]);
- out[19] = add_round_shift_s32(d_lo[19], d_hi[19]);
-
- BUTTERFLY_TWO_S32(c, 24, 23, cospi_3_64, cospi_29_64, d, 29, 3);
- out[29] = add_round_shift_s32(d_lo[29], d_hi[29]);
- out[3] = add_round_shift_s32(d_lo[3], d_hi[3]);
-}
-
-// Add 1 if positive, 2 if negative, and shift by 2.
-// In practice, add 1, then add the sign bit, then shift without rounding.
-static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
- const int16x8_t one = vdupq_n_s16(1);
- const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
- const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
- const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
- return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
-}
-
-static void dct_body_second_pass_rd(const int16x8_t *in, int16x8_t *out) {
- int16x8_t a[32];
- int16x8_t b[32];
-
- // Stage 1. Done as part of the load for the first pass.
- a[0] = vaddq_s16(in[0], in[31]);
- a[1] = vaddq_s16(in[1], in[30]);
- a[2] = vaddq_s16(in[2], in[29]);
- a[3] = vaddq_s16(in[3], in[28]);
- a[4] = vaddq_s16(in[4], in[27]);
- a[5] = vaddq_s16(in[5], in[26]);
- a[6] = vaddq_s16(in[6], in[25]);
- a[7] = vaddq_s16(in[7], in[24]);
- a[8] = vaddq_s16(in[8], in[23]);
- a[9] = vaddq_s16(in[9], in[22]);
- a[10] = vaddq_s16(in[10], in[21]);
- a[11] = vaddq_s16(in[11], in[20]);
- a[12] = vaddq_s16(in[12], in[19]);
- a[13] = vaddq_s16(in[13], in[18]);
- a[14] = vaddq_s16(in[14], in[17]);
- a[15] = vaddq_s16(in[15], in[16]);
- a[16] = vsubq_s16(in[15], in[16]);
- a[17] = vsubq_s16(in[14], in[17]);
- a[18] = vsubq_s16(in[13], in[18]);
- a[19] = vsubq_s16(in[12], in[19]);
- a[20] = vsubq_s16(in[11], in[20]);
- a[21] = vsubq_s16(in[10], in[21]);
- a[22] = vsubq_s16(in[9], in[22]);
- a[23] = vsubq_s16(in[8], in[23]);
- a[24] = vsubq_s16(in[7], in[24]);
- a[25] = vsubq_s16(in[6], in[25]);
- a[26] = vsubq_s16(in[5], in[26]);
- a[27] = vsubq_s16(in[4], in[27]);
- a[28] = vsubq_s16(in[3], in[28]);
- a[29] = vsubq_s16(in[2], in[29]);
- a[30] = vsubq_s16(in[1], in[30]);
- a[31] = vsubq_s16(in[0], in[31]);
-
- // Stage 2.
- // For the "rd" version, all the values are rounded down after stage 2 to keep
- // the values in 16 bits.
- b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
- b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
- b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
- b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
- b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
- b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
- b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
- b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
-
- b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
- b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
- b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
- b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
- b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
- b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
- b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
- b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
-
- b[16] = add_round_shift_s16(a[16]);
- b[17] = add_round_shift_s16(a[17]);
- b[18] = add_round_shift_s16(a[18]);
- b[19] = add_round_shift_s16(a[19]);
-
- butterfly_one_coeff(a[27], a[20], cospi_16_64, &b[27], &b[20]);
- butterfly_one_coeff(a[26], a[21], cospi_16_64, &b[26], &b[21]);
- butterfly_one_coeff(a[25], a[22], cospi_16_64, &b[25], &b[22]);
- butterfly_one_coeff(a[24], a[23], cospi_16_64, &b[24], &b[23]);
- b[20] = add_round_shift_s16(b[20]);
- b[21] = add_round_shift_s16(b[21]);
- b[22] = add_round_shift_s16(b[22]);
- b[23] = add_round_shift_s16(b[23]);
- b[24] = add_round_shift_s16(b[24]);
- b[25] = add_round_shift_s16(b[25]);
- b[26] = add_round_shift_s16(b[26]);
- b[27] = add_round_shift_s16(b[27]);
-
- b[28] = add_round_shift_s16(a[28]);
- b[29] = add_round_shift_s16(a[29]);
- b[30] = add_round_shift_s16(a[30]);
- b[31] = add_round_shift_s16(a[31]);
-
- // Stage 3.
- a[0] = vaddq_s16(b[0], b[7]);
- a[1] = vaddq_s16(b[1], b[6]);
- a[2] = vaddq_s16(b[2], b[5]);
- a[3] = vaddq_s16(b[3], b[4]);
-
- a[4] = vsubq_s16(b[3], b[4]);
- a[5] = vsubq_s16(b[2], b[5]);
- a[6] = vsubq_s16(b[1], b[6]);
- a[7] = vsubq_s16(b[0], b[7]);
-
- a[8] = b[8];
- a[9] = b[9];
-
- butterfly_one_coeff(b[13], b[10], cospi_16_64, &a[13], &a[10]);
- butterfly_one_coeff(b[12], b[11], cospi_16_64, &a[12], &a[11]);
-
- a[14] = b[14];
- a[15] = b[15];
-
- a[16] = vaddq_s16(b[16], b[23]);
- a[17] = vaddq_s16(b[17], b[22]);
- a[18] = vaddq_s16(b[18], b[21]);
- a[19] = vaddq_s16(b[19], b[20]);
-
- a[20] = vsubq_s16(b[19], b[20]);
- a[21] = vsubq_s16(b[18], b[21]);
- a[22] = vsubq_s16(b[17], b[22]);
- a[23] = vsubq_s16(b[16], b[23]);
-
- a[24] = vsubq_s16(b[31], b[24]);
- a[25] = vsubq_s16(b[30], b[25]);
- a[26] = vsubq_s16(b[29], b[26]);
- a[27] = vsubq_s16(b[28], b[27]);
-
- a[28] = vaddq_s16(b[28], b[27]);
- a[29] = vaddq_s16(b[29], b[26]);
- a[30] = vaddq_s16(b[30], b[25]);
- a[31] = vaddq_s16(b[31], b[24]);
-
- // Stage 4.
- b[0] = vaddq_s16(a[0], a[3]);
- b[1] = vaddq_s16(a[1], a[2]);
- b[2] = vsubq_s16(a[1], a[2]);
- b[3] = vsubq_s16(a[0], a[3]);
-
- b[4] = a[4];
-
- butterfly_one_coeff(a[6], a[5], cospi_16_64, &b[6], &b[5]);
-
- b[7] = a[7];
-
- b[8] = vaddq_s16(a[8], a[11]);
- b[9] = vaddq_s16(a[9], a[10]);
- b[10] = vsubq_s16(a[9], a[10]);
- b[11] = vsubq_s16(a[8], a[11]);
- b[12] = vsubq_s16(a[15], a[12]);
- b[13] = vsubq_s16(a[14], a[13]);
- b[14] = vaddq_s16(a[14], a[13]);
- b[15] = vaddq_s16(a[15], a[12]);
-
- b[16] = a[16];
- b[17] = a[17];
-
- butterfly_two_coeff(a[29], a[18], cospi_24_64, cospi_8_64, &b[29], &b[18]);
- butterfly_two_coeff(a[28], a[19], cospi_24_64, cospi_8_64, &b[28], &b[19]);
- butterfly_two_coeff(a[27], a[20], -cospi_8_64, cospi_24_64, &b[27], &b[20]);
- butterfly_two_coeff(a[26], a[21], -cospi_8_64, cospi_24_64, &b[26], &b[21]);
-
- b[22] = a[22];
- b[23] = a[23];
- b[24] = a[24];
- b[25] = a[25];
-
- b[30] = a[30];
- b[31] = a[31];
-
- // Stage 5.
- butterfly_one_coeff(b[0], b[1], cospi_16_64, &a[0], &a[1]);
- butterfly_two_coeff(b[3], b[2], cospi_24_64, cospi_8_64, &a[2], &a[3]);
-
- a[4] = vaddq_s16(b[4], b[5]);
- a[5] = vsubq_s16(b[4], b[5]);
- a[6] = vsubq_s16(b[7], b[6]);
- a[7] = vaddq_s16(b[7], b[6]);
-
- a[8] = b[8];
-
- butterfly_two_coeff(b[14], b[9], cospi_24_64, cospi_8_64, &a[14], &a[9]);
- butterfly_two_coeff(b[13], b[10], -cospi_8_64, cospi_24_64, &a[13], &a[10]);
-
- a[11] = b[11];
- a[12] = b[12];
-
- a[15] = b[15];
-
- a[16] = vaddq_s16(b[19], b[16]);
- a[17] = vaddq_s16(b[18], b[17]);
- a[18] = vsubq_s16(b[17], b[18]);
- a[19] = vsubq_s16(b[16], b[19]);
- a[20] = vsubq_s16(b[23], b[20]);
- a[21] = vsubq_s16(b[22], b[21]);
- a[22] = vaddq_s16(b[21], b[22]);
- a[23] = vaddq_s16(b[20], b[23]);
- a[24] = vaddq_s16(b[27], b[24]);
- a[25] = vaddq_s16(b[26], b[25]);
- a[26] = vsubq_s16(b[25], b[26]);
- a[27] = vsubq_s16(b[24], b[27]);
- a[28] = vsubq_s16(b[31], b[28]);
- a[29] = vsubq_s16(b[30], b[29]);
- a[30] = vaddq_s16(b[29], b[30]);
- a[31] = vaddq_s16(b[28], b[31]);
-
- // Stage 6.
- b[0] = a[0];
- b[1] = a[1];
- b[2] = a[2];
- b[3] = a[3];
-
- butterfly_two_coeff(a[7], a[4], cospi_28_64, cospi_4_64, &b[4], &b[7]);
- butterfly_two_coeff(a[6], a[5], cospi_12_64, cospi_20_64, &b[5], &b[6]);
-
- b[8] = vaddq_s16(a[8], a[9]);
- b[9] = vsubq_s16(a[8], a[9]);
- b[10] = vsubq_s16(a[11], a[10]);
- b[11] = vaddq_s16(a[11], a[10]);
- b[12] = vaddq_s16(a[12], a[13]);
- b[13] = vsubq_s16(a[12], a[13]);
- b[14] = vsubq_s16(a[15], a[14]);
- b[15] = vaddq_s16(a[15], a[14]);
-
- b[16] = a[16];
- b[19] = a[19];
- b[20] = a[20];
- b[23] = a[23];
- b[24] = a[24];
- b[27] = a[27];
- b[28] = a[28];
- b[31] = a[31];
-
- butterfly_two_coeff(a[30], a[17], cospi_28_64, cospi_4_64, &b[30], &b[17]);
- butterfly_two_coeff(a[29], a[18], -cospi_4_64, cospi_28_64, &b[29], &b[18]);
-
- butterfly_two_coeff(a[26], a[21], cospi_12_64, cospi_20_64, &b[26], &b[21]);
- butterfly_two_coeff(a[25], a[22], -cospi_20_64, cospi_12_64, &b[25], &b[22]);
-
- // Stage 7.
- a[0] = b[0];
- a[1] = b[1];
- a[2] = b[2];
- a[3] = b[3];
- a[4] = b[4];
- a[5] = b[5];
- a[6] = b[6];
- a[7] = b[7];
-
- butterfly_two_coeff(b[15], b[8], cospi_30_64, cospi_2_64, &a[8], &a[15]);
- butterfly_two_coeff(b[14], b[9], cospi_14_64, cospi_18_64, &a[9], &a[14]);
- butterfly_two_coeff(b[13], b[10], cospi_22_64, cospi_10_64, &a[10], &a[13]);
- butterfly_two_coeff(b[12], b[11], cospi_6_64, cospi_26_64, &a[11], &a[12]);
-
- a[16] = vaddq_s16(b[16], b[17]);
- a[17] = vsubq_s16(b[16], b[17]);
- a[18] = vsubq_s16(b[19], b[18]);
- a[19] = vaddq_s16(b[19], b[18]);
- a[20] = vaddq_s16(b[20], b[21]);
- a[21] = vsubq_s16(b[20], b[21]);
- a[22] = vsubq_s16(b[23], b[22]);
- a[23] = vaddq_s16(b[23], b[22]);
- a[24] = vaddq_s16(b[24], b[25]);
- a[25] = vsubq_s16(b[24], b[25]);
- a[26] = vsubq_s16(b[27], b[26]);
- a[27] = vaddq_s16(b[27], b[26]);
- a[28] = vaddq_s16(b[28], b[29]);
- a[29] = vsubq_s16(b[28], b[29]);
- a[30] = vsubq_s16(b[31], b[30]);
- a[31] = vaddq_s16(b[31], b[30]);
-
- // Final stage.
- out[0] = a[0];
- out[16] = a[1];
- out[8] = a[2];
- out[24] = a[3];
- out[4] = a[4];
- out[20] = a[5];
- out[12] = a[6];
- out[28] = a[7];
- out[2] = a[8];
- out[18] = a[9];
- out[10] = a[10];
- out[26] = a[11];
- out[6] = a[12];
- out[22] = a[13];
- out[14] = a[14];
- out[30] = a[15];
-
- butterfly_two_coeff(a[31], a[16], cospi_31_64, cospi_1_64, &out[1], &out[31]);
- butterfly_two_coeff(a[30], a[17], cospi_15_64, cospi_17_64, &out[17],
- &out[15]);
- butterfly_two_coeff(a[29], a[18], cospi_23_64, cospi_9_64, &out[9], &out[23]);
- butterfly_two_coeff(a[28], a[19], cospi_7_64, cospi_25_64, &out[25], &out[7]);
- butterfly_two_coeff(a[27], a[20], cospi_27_64, cospi_5_64, &out[5], &out[27]);
- butterfly_two_coeff(a[26], a[21], cospi_11_64, cospi_21_64, &out[21],
- &out[11]);
- butterfly_two_coeff(a[25], a[22], cospi_19_64, cospi_13_64, &out[13],
- &out[19]);
- butterfly_two_coeff(a[24], a[23], cospi_3_64, cospi_29_64, &out[29], &out[3]);
-}
-
-#undef PASS_THROUGH
-#undef ADD_S16_S32
-#undef SUB_S16_S32
-#undef ADDW_S16_S32
-#undef SUBW_S16_S32
-#undef ADD_S32
-#undef SUB_S32
-#undef BUTTERFLY_ONE_S16_S32
-#undef BUTTERFLY_ONE_S32
-#undef BUTTERFLY_TWO_S32
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-// TODO(johannkoenig): share with other fdcts.
-static INLINE void transpose_8x8(const int16x8_t *a, int16x8_t *b) {
- // Swap 16 bit elements.
- const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
- const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
- const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
- const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
- // Swap 32 bit elements.
- const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
- vreinterpretq_s32_s16(c1.val[0]));
- const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
- vreinterpretq_s32_s16(c1.val[1]));
- const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
- vreinterpretq_s32_s16(c3.val[0]));
- const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
- vreinterpretq_s32_s16(c3.val[1]));
-
- // Swap 64 bit elements
- const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
- const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
- const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
- const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
- b[0] = e0.val[0];
- b[1] = e1.val[0];
- b[2] = e2.val[0];
- b[3] = e3.val[0];
- b[4] = e0.val[1];
- b[5] = e1.val[1];
- b[6] = e2.val[1];
- b[7] = e3.val[1];
-}
-
void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp0[32];
int16x8_t temp1[32];
@@ -1324,23 +43,27 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
int16x8_t temp5[32];
// Process in 8x32 columns.
- load(input, stride, temp0);
- dct_body_first_pass(temp0, temp1);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
- load(input + 8, stride, temp0);
- dct_body_first_pass(temp0, temp2);
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
- load(input + 16, stride, temp0);
- dct_body_first_pass(temp0, temp3);
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
- load(input + 24, stride, temp0);
- dct_body_first_pass(temp0, temp4);
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
- transpose_8x8(&temp1[0], &temp0[0]);
- transpose_8x8(&temp2[0], &temp0[8]);
- transpose_8x8(&temp3[0], &temp0[16]);
- transpose_8x8(&temp4[0], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[0], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[0], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[0], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[0], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1355,10 +78,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output, temp5);
// Second row of 8x32.
- transpose_8x8(&temp1[8], &temp0[0]);
- transpose_8x8(&temp2[8], &temp0[8]);
- transpose_8x8(&temp3[8], &temp0[16]);
- transpose_8x8(&temp4[8], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[8], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[8], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[8], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[8], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1373,10 +96,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 8 * 32, temp5);
// Third row of 8x32
- transpose_8x8(&temp1[16], &temp0[0]);
- transpose_8x8(&temp2[16], &temp0[8]);
- transpose_8x8(&temp3[16], &temp0[16]);
- transpose_8x8(&temp4[16], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[16], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[16], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[16], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[16], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1391,10 +114,10 @@ void vpx_fdct32x32_neon(const int16_t *input, tran_low_t *output, int stride) {
store(output + 16 * 32, temp5);
// Final row of 8x32.
- transpose_8x8(&temp1[24], &temp0[0]);
- transpose_8x8(&temp2[24], &temp0[8]);
- transpose_8x8(&temp3[24], &temp0[16]);
- transpose_8x8(&temp4[24], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[24], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[24], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[24], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[24], &temp0[24]);
dct_body_second_pass(temp0, temp5);
@@ -1419,23 +142,27 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
int16x8_t temp5[32];
// Process in 8x32 columns.
- load(input, stride, temp0);
- dct_body_first_pass(temp0, temp1);
+ load_cross(input, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp1);
- load(input + 8, stride, temp0);
- dct_body_first_pass(temp0, temp2);
+ load_cross(input + 8, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp2);
- load(input + 16, stride, temp0);
- dct_body_first_pass(temp0, temp3);
+ load_cross(input + 16, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp3);
- load(input + 24, stride, temp0);
- dct_body_first_pass(temp0, temp4);
+ load_cross(input + 24, stride, temp0);
+ scale_input(temp0, temp5);
+ dct_body_first_pass(temp5, temp4);
// Generate the top row by munging the first set of 8 from each one together.
- transpose_8x8(&temp1[0], &temp0[0]);
- transpose_8x8(&temp2[0], &temp0[8]);
- transpose_8x8(&temp3[0], &temp0[16]);
- transpose_8x8(&temp4[0], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[0], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[0], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[0], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[0], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1450,10 +177,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output, temp5);
// Second row of 8x32.
- transpose_8x8(&temp1[8], &temp0[0]);
- transpose_8x8(&temp2[8], &temp0[8]);
- transpose_8x8(&temp3[8], &temp0[16]);
- transpose_8x8(&temp4[8], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[8], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[8], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[8], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[8], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1468,10 +195,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 8 * 32, temp5);
// Third row of 8x32
- transpose_8x8(&temp1[16], &temp0[0]);
- transpose_8x8(&temp2[16], &temp0[8]);
- transpose_8x8(&temp3[16], &temp0[16]);
- transpose_8x8(&temp4[16], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[16], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[16], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[16], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[16], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1486,10 +213,10 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
store(output + 16 * 32, temp5);
// Final row of 8x32.
- transpose_8x8(&temp1[24], &temp0[0]);
- transpose_8x8(&temp2[24], &temp0[8]);
- transpose_8x8(&temp3[24], &temp0[16]);
- transpose_8x8(&temp4[24], &temp0[24]);
+ transpose_s16_8x8_new(&temp1[24], &temp0[0]);
+ transpose_s16_8x8_new(&temp2[24], &temp0[8]);
+ transpose_s16_8x8_new(&temp3[24], &temp0[16]);
+ transpose_s16_8x8_new(&temp4[24], &temp0[24]);
dct_body_second_pass_rd(temp0, temp5);
@@ -1503,5 +230,190 @@ void vpx_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
&temp5[29], &temp5[30], &temp5[31]);
store(output + 24 * 32, temp5);
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct32x32_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top row by munging the first set of 8 from each one together.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass(left5, right5);
+ highbd_partial_add_round_shift(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass(left6, right6);
+ highbd_partial_add_round_shift(left6, right6);
+
+ // Third row of 8x32
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass(left7, right7);
+ highbd_partial_add_round_shift(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass(left8, right8);
+ highbd_partial_add_round_shift(left8, right8);
+
+ // Final transpose
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
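+
+// For reference, a scalar sketch of the rounding performed after each pass
+// above. highbd_partial_sub_round_shift() and highbd_partial_add_round_shift()
+// are presumably defined in fdct32x32_neon.h (included above); the formulas
+// restate the comments that accompanied the removed 16-bit
+// sub_round_shift()/add_round_shift() helpers:
+//
+//   // after the first pass: add 2 if positive, 1 if negative, shift by 2
+//   out = (x + 1 + (x > 0)) >> 2;
+//   // after the second pass: add 1 if positive, 2 if negative, shift by 2
+//   out = (x + 1 + (x < 0)) >> 2;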
+
+void vpx_highbd_fdct32x32_rd_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int16x8_t temp0[32];
+ int32x4_t left1[32], left2[32], left3[32], left4[32], right1[32], right2[32],
+ right3[32], right4[32];
+ int32x4_t left5[32], right5[32], left6[32], right6[32], left7[32], right7[32],
+ left8[32], right8[32];
+ int32x4_t temp1[32], temp2[32];
+
+ // Process in 8x32 columns.
+ load_cross(input, stride, temp0);
+ highbd_scale_input(temp0, left1, right1);
+ highbd_dct8x32_body_first_pass(left1, right1);
+ highbd_partial_sub_round_shift(left1, right1);
+
+ load_cross(input + 8, stride, temp0);
+ highbd_scale_input(temp0, left2, right2);
+ highbd_dct8x32_body_first_pass(left2, right2);
+ highbd_partial_sub_round_shift(left2, right2);
+
+ load_cross(input + 16, stride, temp0);
+ highbd_scale_input(temp0, left3, right3);
+ highbd_dct8x32_body_first_pass(left3, right3);
+ highbd_partial_sub_round_shift(left3, right3);
+
+ load_cross(input + 24, stride, temp0);
+ highbd_scale_input(temp0, left4, right4);
+ highbd_dct8x32_body_first_pass(left4, right4);
+ highbd_partial_sub_round_shift(left4, right4);
+
+ // Generate the top 8x32 row block by transposing together the first set of
+ // 8 from each column block.
+ transpose_s32_8x8_2(left1, right1, temp1, temp2);
+ transpose_s32_8x8_2(left2, right2, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3, right3, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4, right4, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left5, right5);
+ highbd_dct8x32_body_second_pass_rd(left5, right5);
+
+ // Second row of 8x32.
+ transpose_s32_8x8_2(left1 + 8, right1 + 8, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 8, right2 + 8, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 8, right3 + 8, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 8, right4 + 8, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left6, right6);
+ highbd_dct8x32_body_second_pass_rd(left6, right6);
+
+ // Third row of 8x32.
+ transpose_s32_8x8_2(left1 + 16, right1 + 16, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 16, right2 + 16, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 16, right3 + 16, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 16, right4 + 16, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left7, right7);
+ highbd_dct8x32_body_second_pass_rd(left7, right7);
+
+ // Final row of 8x32.
+ transpose_s32_8x8_2(left1 + 24, right1 + 24, temp1, temp2);
+ transpose_s32_8x8_2(left2 + 24, right2 + 24, temp1 + 8, temp2 + 8);
+ transpose_s32_8x8_2(left3 + 24, right3 + 24, temp1 + 16, temp2 + 16);
+ transpose_s32_8x8_2(left4 + 24, right4 + 24, temp1 + 24, temp2 + 24);
+
+ highbd_cross_input(temp1, temp2, left8, right8);
+ highbd_dct8x32_body_second_pass_rd(left8, right8);
+
+ // Final transpose.
+ transpose_s32_8x8_2(left5, right5, left1, right1);
+ transpose_s32_8x8_2(left5 + 8, right5 + 8, left2, right2);
+ transpose_s32_8x8_2(left5 + 16, right5 + 16, left3, right3);
+ transpose_s32_8x8_2(left5 + 24, right5 + 24, left4, right4);
+ transpose_s32_8x8_2(left6, right6, left1 + 8, right1 + 8);
+ transpose_s32_8x8_2(left6 + 8, right6 + 8, left2 + 8, right2 + 8);
+ transpose_s32_8x8_2(left6 + 16, right6 + 16, left3 + 8, right3 + 8);
+ transpose_s32_8x8_2(left6 + 24, right6 + 24, left4 + 8, right4 + 8);
+ transpose_s32_8x8_2(left7, right7, left1 + 16, right1 + 16);
+ transpose_s32_8x8_2(left7 + 8, right7 + 8, left2 + 16, right2 + 16);
+ transpose_s32_8x8_2(left7 + 16, right7 + 16, left3 + 16, right3 + 16);
+ transpose_s32_8x8_2(left7 + 24, right7 + 24, left4 + 16, right4 + 16);
+ transpose_s32_8x8_2(left8, right8, left1 + 24, right1 + 24);
+ transpose_s32_8x8_2(left8 + 8, right8 + 8, left2 + 24, right2 + 24);
+ transpose_s32_8x8_2(left8 + 16, right8 + 16, left3 + 24, right3 + 24);
+ transpose_s32_8x8_2(left8 + 24, right8 + 24, left4 + 24, right4 + 24);
+
+ store32x32_s32(output, left1, right1, left2, right2, left3, right3, left4,
+ right4);
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
#endif // !defined(__clang__) && !defined(__ANDROID__) && defined(__GNUC__) &&
// __GNUC__ == 4 && __GNUC_MINOR__ <= 9
diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.h b/libvpx/vpx_dsp/arm/fdct32x32_neon.h
new file mode 100644
index 000000000..3b9e64c6d
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.h
@@ -0,0 +1,2919 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+// Load the rows and cross them: rows 0-7 with rows 24-31 first, then rows
+// 8-15 with rows 16-23.
+static INLINE void load_cross(const int16_t *a, int stride, int16x8_t *b) {
+ b[0] = vaddq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+ b[1] = vaddq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[2] = vaddq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[3] = vaddq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[4] = vaddq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[5] = vaddq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[6] = vaddq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[7] = vaddq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+
+ b[24] = vsubq_s16(vld1q_s16(a + 7 * stride), vld1q_s16(a + 24 * stride));
+ b[25] = vsubq_s16(vld1q_s16(a + 6 * stride), vld1q_s16(a + 25 * stride));
+ b[26] = vsubq_s16(vld1q_s16(a + 5 * stride), vld1q_s16(a + 26 * stride));
+ b[27] = vsubq_s16(vld1q_s16(a + 4 * stride), vld1q_s16(a + 27 * stride));
+ b[28] = vsubq_s16(vld1q_s16(a + 3 * stride), vld1q_s16(a + 28 * stride));
+ b[29] = vsubq_s16(vld1q_s16(a + 2 * stride), vld1q_s16(a + 29 * stride));
+ b[30] = vsubq_s16(vld1q_s16(a + 1 * stride), vld1q_s16(a + 30 * stride));
+ b[31] = vsubq_s16(vld1q_s16(a + 0 * stride), vld1q_s16(a + 31 * stride));
+
+ b[8] = vaddq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+ b[9] = vaddq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[10] = vaddq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[11] = vaddq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[12] = vaddq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[13] = vaddq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[14] = vaddq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[15] = vaddq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+
+ b[16] = vsubq_s16(vld1q_s16(a + 15 * stride), vld1q_s16(a + 16 * stride));
+ b[17] = vsubq_s16(vld1q_s16(a + 14 * stride), vld1q_s16(a + 17 * stride));
+ b[18] = vsubq_s16(vld1q_s16(a + 13 * stride), vld1q_s16(a + 18 * stride));
+ b[19] = vsubq_s16(vld1q_s16(a + 12 * stride), vld1q_s16(a + 19 * stride));
+ b[20] = vsubq_s16(vld1q_s16(a + 11 * stride), vld1q_s16(a + 20 * stride));
+ b[21] = vsubq_s16(vld1q_s16(a + 10 * stride), vld1q_s16(a + 21 * stride));
+ b[22] = vsubq_s16(vld1q_s16(a + 9 * stride), vld1q_s16(a + 22 * stride));
+ b[23] = vsubq_s16(vld1q_s16(a + 8 * stride), vld1q_s16(a + 23 * stride));
+}
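+
+// For reference, a scalar sketch of the cross above for a single column x
+// (the NEON version processes 8 columns per vector):
+//   for (int i = 0; i < 16; ++i) {
+//     b[i]      = a[i * stride + x] + a[(31 - i) * stride + x];
+//     b[31 - i] = a[i * stride + x] - a[(31 - i) * stride + x];
+//   }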
+
+#define STORE_S16(src, index, dest) \
+ do { \
+ store_s16q_to_tran_low(dest, src[index]); \
+ dest += 8; \
+ } while (0)
+
+// Store 32 int16x8_t vectors as an 8x32 block, assuming stride == 32.
+// Slight twist: store horizontally in blocks of 8.
+static INLINE void store(tran_low_t *a, const int16x8_t *b) {
+ STORE_S16(b, 0, a);
+ STORE_S16(b, 8, a);
+ STORE_S16(b, 16, a);
+ STORE_S16(b, 24, a);
+ STORE_S16(b, 1, a);
+ STORE_S16(b, 9, a);
+ STORE_S16(b, 17, a);
+ STORE_S16(b, 25, a);
+ STORE_S16(b, 2, a);
+ STORE_S16(b, 10, a);
+ STORE_S16(b, 18, a);
+ STORE_S16(b, 26, a);
+ STORE_S16(b, 3, a);
+ STORE_S16(b, 11, a);
+ STORE_S16(b, 19, a);
+ STORE_S16(b, 27, a);
+ STORE_S16(b, 4, a);
+ STORE_S16(b, 12, a);
+ STORE_S16(b, 20, a);
+ STORE_S16(b, 28, a);
+ STORE_S16(b, 5, a);
+ STORE_S16(b, 13, a);
+ STORE_S16(b, 21, a);
+ STORE_S16(b, 29, a);
+ STORE_S16(b, 6, a);
+ STORE_S16(b, 14, a);
+ STORE_S16(b, 22, a);
+ STORE_S16(b, 30, a);
+ STORE_S16(b, 7, a);
+ STORE_S16(b, 15, a);
+ STORE_S16(b, 23, a);
+ STORE_S16(b, 31, a);
+}
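+
+// Concretely, row r of the 8x32 block (r = 0..7) is the concatenation
+// b[r] | b[r + 8] | b[r + 16] | b[r + 24], eight int16 lanes from each.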
+
+#undef STORE_S16
+
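+// The << 2 below is the forward-transform input scaling; the scalar first
+// pass multiplies each input sample by 4 on load.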
+static INLINE void scale_input(const int16x8_t *in /*32*/,
+ int16x8_t *out /*32*/) {
+ out[0] = vshlq_n_s16(in[0], 2);
+ out[1] = vshlq_n_s16(in[1], 2);
+ out[2] = vshlq_n_s16(in[2], 2);
+ out[3] = vshlq_n_s16(in[3], 2);
+ out[4] = vshlq_n_s16(in[4], 2);
+ out[5] = vshlq_n_s16(in[5], 2);
+ out[6] = vshlq_n_s16(in[6], 2);
+ out[7] = vshlq_n_s16(in[7], 2);
+
+ out[8] = vshlq_n_s16(in[8], 2);
+ out[9] = vshlq_n_s16(in[9], 2);
+ out[10] = vshlq_n_s16(in[10], 2);
+ out[11] = vshlq_n_s16(in[11], 2);
+ out[12] = vshlq_n_s16(in[12], 2);
+ out[13] = vshlq_n_s16(in[13], 2);
+ out[14] = vshlq_n_s16(in[14], 2);
+ out[15] = vshlq_n_s16(in[15], 2);
+
+ out[16] = vshlq_n_s16(in[16], 2);
+ out[17] = vshlq_n_s16(in[17], 2);
+ out[18] = vshlq_n_s16(in[18], 2);
+ out[19] = vshlq_n_s16(in[19], 2);
+ out[20] = vshlq_n_s16(in[20], 2);
+ out[21] = vshlq_n_s16(in[21], 2);
+ out[22] = vshlq_n_s16(in[22], 2);
+ out[23] = vshlq_n_s16(in[23], 2);
+
+ out[24] = vshlq_n_s16(in[24], 2);
+ out[25] = vshlq_n_s16(in[25], 2);
+ out[26] = vshlq_n_s16(in[26], 2);
+ out[27] = vshlq_n_s16(in[27], 2);
+ out[28] = vshlq_n_s16(in[28], 2);
+ out[29] = vshlq_n_s16(in[29], 2);
+ out[30] = vshlq_n_s16(in[30], 2);
+ out[31] = vshlq_n_s16(in[31], 2);
+}
+
+static INLINE void dct_body_first_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross: cross the first 16 values and the middle 8 of the second
+ // half.
+ a[0] = vaddq_s16(in[0], in[15]);
+ a[1] = vaddq_s16(in[1], in[14]);
+ a[2] = vaddq_s16(in[2], in[13]);
+ a[3] = vaddq_s16(in[3], in[12]);
+ a[4] = vaddq_s16(in[4], in[11]);
+ a[5] = vaddq_s16(in[5], in[10]);
+ a[6] = vaddq_s16(in[6], in[9]);
+ a[7] = vaddq_s16(in[7], in[8]);
+
+ a[8] = vsubq_s16(in[7], in[8]);
+ a[9] = vsubq_s16(in[6], in[9]);
+ a[10] = vsubq_s16(in[5], in[10]);
+ a[11] = vsubq_s16(in[4], in[11]);
+ a[12] = vsubq_s16(in[3], in[12]);
+ a[13] = vsubq_s16(in[2], in[13]);
+ a[14] = vsubq_s16(in[1], in[14]);
+ a[15] = vsubq_s16(in[0], in[15]);
+
+ a[16] = in[16];
+ a[17] = in[17];
+ a[18] = in[18];
+ a[19] = in[19];
+
+ butterfly_one_coeff_s16_s32_narrow(in[27], in[20], cospi_16_64, &a[27],
+ &a[20]);
+ butterfly_one_coeff_s16_s32_narrow(in[26], in[21], cospi_16_64, &a[26],
+ &a[21]);
+ butterfly_one_coeff_s16_s32_narrow(in[25], in[22], cospi_16_64, &a[25],
+ &a[22]);
+ butterfly_one_coeff_s16_s32_narrow(in[24], in[23], cospi_16_64, &a[24],
+ &a[23]);
+
+ a[28] = in[28];
+ a[29] = in[29];
+ a[30] = in[30];
+ a[31] = in[31];
+
+ // Stage 3.
+ b[0] = vaddq_s16(a[0], a[7]);
+ b[1] = vaddq_s16(a[1], a[6]);
+ b[2] = vaddq_s16(a[2], a[5]);
+ b[3] = vaddq_s16(a[3], a[4]);
+
+ b[4] = vsubq_s16(a[3], a[4]);
+ b[5] = vsubq_s16(a[2], a[5]);
+ b[6] = vsubq_s16(a[1], a[6]);
+ b[7] = vsubq_s16(a[0], a[7]);
+
+ b[8] = a[8];
+ b[9] = a[9];
+
+ butterfly_one_coeff_s16_s32_narrow(a[13], a[10], cospi_16_64, &b[13], &b[10]);
+ butterfly_one_coeff_s16_s32_narrow(a[12], a[11], cospi_16_64, &b[12], &b[11]);
+
+ b[14] = a[14];
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(in[16], a[23]);
+ b[17] = vaddq_s16(in[17], a[22]);
+ b[18] = vaddq_s16(in[18], a[21]);
+ b[19] = vaddq_s16(in[19], a[20]);
+
+ b[20] = vsubq_s16(in[19], a[20]);
+ b[21] = vsubq_s16(in[18], a[21]);
+ b[22] = vsubq_s16(in[17], a[22]);
+ b[23] = vsubq_s16(in[16], a[23]);
+
+ b[24] = vsubq_s16(in[31], a[24]);
+ b[25] = vsubq_s16(in[30], a[25]);
+ b[26] = vsubq_s16(in[29], a[26]);
+ b[27] = vsubq_s16(in[28], a[27]);
+
+ b[28] = vaddq_s16(in[28], a[27]);
+ b[29] = vaddq_s16(in[29], a[26]);
+ b[30] = vaddq_s16(in[30], a[25]);
+ b[31] = vaddq_s16(in[31], a[24]);
+
+ // Stage 4.
+ a[0] = vaddq_s16(b[0], b[3]);
+ a[1] = vaddq_s16(b[1], b[2]);
+ a[2] = vsubq_s16(b[1], b[2]);
+ a[3] = vsubq_s16(b[0], b[3]);
+
+ a[4] = b[4];
+
+ butterfly_one_coeff_s16_s32_narrow(b[6], b[5], cospi_16_64, &a[6], &a[5]);
+
+ a[7] = b[7];
+
+ a[8] = vaddq_s16(b[8], b[11]);
+ a[9] = vaddq_s16(b[9], b[10]);
+ a[10] = vsubq_s16(b[9], b[10]);
+ a[11] = vsubq_s16(b[8], b[11]);
+ a[12] = vsubq_s16(b[15], b[12]);
+ a[13] = vsubq_s16(b[14], b[13]);
+ a[14] = vaddq_s16(b[14], b[13]);
+ a[15] = vaddq_s16(b[15], b[12]);
+
+ a[16] = b[16];
+ a[17] = b[17];
+
+ butterfly_two_coeff(b[29], b[18], cospi_8_64, cospi_24_64, &a[29], &a[18]);
+ butterfly_two_coeff(b[28], b[19], cospi_8_64, cospi_24_64, &a[28], &a[19]);
+ butterfly_two_coeff(b[27], b[20], cospi_24_64, -cospi_8_64, &a[27], &a[20]);
+ butterfly_two_coeff(b[26], b[21], cospi_24_64, -cospi_8_64, &a[26], &a[21]);
+
+ a[22] = b[22];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[25] = b[25];
+
+ a[30] = b[30];
+ a[31] = b[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_fast(a[0], a[1], cospi_16_64, &b[0], &b[1]);
+ butterfly_two_coeff(a[3], a[2], cospi_8_64, cospi_24_64, &b[2], &b[3]);
+
+ b[4] = vaddq_s16(a[4], a[5]);
+ b[5] = vsubq_s16(a[4], a[5]);
+ b[6] = vsubq_s16(a[7], a[6]);
+ b[7] = vaddq_s16(a[7], a[6]);
+
+ b[8] = a[8];
+
+ butterfly_two_coeff(a[14], a[9], cospi_8_64, cospi_24_64, &b[14], &b[9]);
+ butterfly_two_coeff(a[13], a[10], cospi_24_64, -cospi_8_64, &b[13], &b[10]);
+
+ b[11] = a[11];
+ b[12] = a[12];
+
+ b[15] = a[15];
+
+ b[16] = vaddq_s16(a[19], a[16]);
+ b[17] = vaddq_s16(a[18], a[17]);
+ b[18] = vsubq_s16(a[17], a[18]);
+ b[19] = vsubq_s16(a[16], a[19]);
+ b[20] = vsubq_s16(a[23], a[20]);
+ b[21] = vsubq_s16(a[22], a[21]);
+ b[22] = vaddq_s16(a[21], a[22]);
+ b[23] = vaddq_s16(a[20], a[23]);
+ b[24] = vaddq_s16(a[27], a[24]);
+ b[25] = vaddq_s16(a[26], a[25]);
+ b[26] = vsubq_s16(a[25], a[26]);
+ b[27] = vsubq_s16(a[24], a[27]);
+ b[28] = vsubq_s16(a[31], a[28]);
+ b[29] = vsubq_s16(a[30], a[29]);
+ b[30] = vaddq_s16(a[29], a[30]);
+ b[31] = vaddq_s16(a[28], a[31]);
+
+ // Stage 6.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+
+ butterfly_two_coeff(b[7], b[4], cospi_4_64, cospi_28_64, &a[4], &a[7]);
+ butterfly_two_coeff(b[6], b[5], cospi_20_64, cospi_12_64, &a[5], &a[6]);
+
+ a[8] = vaddq_s16(b[8], b[9]);
+ a[9] = vsubq_s16(b[8], b[9]);
+ a[10] = vsubq_s16(b[11], b[10]);
+ a[11] = vaddq_s16(b[11], b[10]);
+ a[12] = vaddq_s16(b[12], b[13]);
+ a[13] = vsubq_s16(b[12], b[13]);
+ a[14] = vsubq_s16(b[15], b[14]);
+ a[15] = vaddq_s16(b[15], b[14]);
+
+ a[16] = b[16];
+ a[19] = b[19];
+ a[20] = b[20];
+ a[23] = b[23];
+ a[24] = b[24];
+ a[27] = b[27];
+ a[28] = b[28];
+ a[31] = b[31];
+
+ butterfly_two_coeff(b[30], b[17], cospi_4_64, cospi_28_64, &a[30], &a[17]);
+ butterfly_two_coeff(b[29], b[18], cospi_28_64, -cospi_4_64, &a[29], &a[18]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_20_64, cospi_12_64, &a[26], &a[21]);
+ butterfly_two_coeff(b[25], b[22], cospi_12_64, -cospi_20_64, &a[25], &a[22]);
+
+ // Stage 7.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+ b[4] = a[4];
+ b[5] = a[5];
+ b[6] = a[6];
+ b[7] = a[7];
+
+ butterfly_two_coeff(a[15], a[8], cospi_2_64, cospi_30_64, &b[8], &b[15]);
+ butterfly_two_coeff(a[14], a[9], cospi_18_64, cospi_14_64, &b[9], &b[14]);
+ butterfly_two_coeff(a[13], a[10], cospi_10_64, cospi_22_64, &b[10], &b[13]);
+ butterfly_two_coeff(a[12], a[11], cospi_26_64, cospi_6_64, &b[11], &b[12]);
+
+ b[16] = vaddq_s16(a[16], a[17]);
+ b[17] = vsubq_s16(a[16], a[17]);
+ b[18] = vsubq_s16(a[19], a[18]);
+ b[19] = vaddq_s16(a[19], a[18]);
+ b[20] = vaddq_s16(a[20], a[21]);
+ b[21] = vsubq_s16(a[20], a[21]);
+ b[22] = vsubq_s16(a[23], a[22]);
+ b[23] = vaddq_s16(a[23], a[22]);
+ b[24] = vaddq_s16(a[24], a[25]);
+ b[25] = vsubq_s16(a[24], a[25]);
+ b[26] = vsubq_s16(a[27], a[26]);
+ b[27] = vaddq_s16(a[27], a[26]);
+ b[28] = vaddq_s16(a[28], a[29]);
+ b[29] = vsubq_s16(a[28], a[29]);
+ b[30] = vsubq_s16(a[31], a[30]);
+ b[31] = vaddq_s16(a[31], a[30]);
+
+ // Final stage.
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[0] = sub_round_shift_s16(b[0]);
+ out[16] = sub_round_shift_s16(b[1]);
+ out[8] = sub_round_shift_s16(b[2]);
+ out[24] = sub_round_shift_s16(b[3]);
+ out[4] = sub_round_shift_s16(b[4]);
+ out[20] = sub_round_shift_s16(b[5]);
+ out[12] = sub_round_shift_s16(b[6]);
+ out[28] = sub_round_shift_s16(b[7]);
+ out[2] = sub_round_shift_s16(b[8]);
+ out[18] = sub_round_shift_s16(b[9]);
+ out[10] = sub_round_shift_s16(b[10]);
+ out[26] = sub_round_shift_s16(b[11]);
+ out[6] = sub_round_shift_s16(b[12]);
+ out[22] = sub_round_shift_s16(b[13]);
+ out[14] = sub_round_shift_s16(b[14]);
+ out[30] = sub_round_shift_s16(b[15]);
+
+ butterfly_two_coeff(b[31], b[16], cospi_1_64, cospi_31_64, &a[1], &a[31]);
+ out[1] = sub_round_shift_s16(a[1]);
+ out[31] = sub_round_shift_s16(a[31]);
+
+ butterfly_two_coeff(b[30], b[17], cospi_17_64, cospi_15_64, &a[17], &a[15]);
+ out[17] = sub_round_shift_s16(a[17]);
+ out[15] = sub_round_shift_s16(a[15]);
+
+ butterfly_two_coeff(b[29], b[18], cospi_9_64, cospi_23_64, &a[9], &a[23]);
+ out[9] = sub_round_shift_s16(a[9]);
+ out[23] = sub_round_shift_s16(a[23]);
+
+ butterfly_two_coeff(b[28], b[19], cospi_25_64, cospi_7_64, &a[25], &a[7]);
+ out[25] = sub_round_shift_s16(a[25]);
+ out[7] = sub_round_shift_s16(a[7]);
+
+ butterfly_two_coeff(b[27], b[20], cospi_5_64, cospi_27_64, &a[5], &a[27]);
+ out[5] = sub_round_shift_s16(a[5]);
+ out[27] = sub_round_shift_s16(a[27]);
+
+ butterfly_two_coeff(b[26], b[21], cospi_21_64, cospi_11_64, &a[21], &a[11]);
+ out[21] = sub_round_shift_s16(a[21]);
+ out[11] = sub_round_shift_s16(a[11]);
+
+ butterfly_two_coeff(b[25], b[22], cospi_13_64, cospi_19_64, &a[13], &a[19]);
+ out[13] = sub_round_shift_s16(a[13]);
+ out[19] = sub_round_shift_s16(a[19]);
+
+ butterfly_two_coeff(b[24], b[23], cospi_29_64, cospi_3_64, &a[29], &a[3]);
+ out[29] = sub_round_shift_s16(a[29]);
+ out[3] = sub_round_shift_s16(a[3]);
+}
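+
+// Worked example of the sub_round_shift rounding quoted above,
+// (v + 1 + (v > 0)) >> 2:  6 -> (6 + 1 + 1) >> 2 = 2 and
+// -6 -> (-6 + 1 + 0) >> 2 = -2 (arithmetic shift), i.e. a divide by 4 that
+// rounds symmetrically.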
+
+#define PASS_THROUGH(src, dst, element) \
+ do { \
+ dst##_lo[element] = src##_lo[element]; \
+ dst##_hi[element] = src##_hi[element]; \
+ } while (0)
+
+#define ADD_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vaddl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vaddl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define SUB_S16_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = \
+ vsubl_s16(vget_low_s16(a[left_index]), vget_low_s16(a[right_index])); \
+ b##_hi[b_index] = vsubl_s16(vget_high_s16(a[left_index]), \
+ vget_high_s16(a[right_index])); \
+ } while (0)
+
+#define ADDW_S16_S32(a, a_index, b, b_index, c, c_index) \
+ do { \
+ c##_lo[c_index] = vaddw_s16(a##_lo[a_index], vget_low_s16(b[b_index])); \
+ c##_hi[c_index] = vaddw_s16(a##_hi[a_index], vget_high_s16(b[b_index])); \
+ } while (0)
+
+#define SUBW_S16_S32(a, a_index, b, b_index, temp, temp_index, c, c_index) \
+ do { \
+ temp##_lo[temp_index] = vmovl_s16(vget_low_s16(a[a_index])); \
+ temp##_hi[temp_index] = vmovl_s16(vget_high_s16(a[a_index])); \
+ c##_lo[c_index] = vsubq_s32(temp##_lo[temp_index], b##_lo[b_index]); \
+ c##_hi[c_index] = vsubq_s32(temp##_hi[temp_index], b##_hi[b_index]); \
+ } while (0)
+
+#define ADD_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vaddq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vaddq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define SUB_S32(a, left_index, right_index, b, b_index) \
+ do { \
+ b##_lo[b_index] = vsubq_s32(a##_lo[left_index], a##_lo[right_index]); \
+ b##_hi[b_index] = vsubq_s32(a##_hi[left_index], a##_hi[right_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S16_S32(a, left_index, right_index, constant, b, \
+ add_index, sub_index) \
+ do { \
+ butterfly_one_coeff_s16_s32(a[left_index], a[right_index], constant, \
+ &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_ONE_S32(a, left_index, right_index, constant, b, add_index, \
+ sub_index) \
+ do { \
+ butterfly_one_coeff_s32_fast( \
+ a##_lo[left_index], a##_hi[left_index], a##_lo[right_index], \
+ a##_hi[right_index], constant, &b##_lo[add_index], &b##_hi[add_index], \
+ &b##_lo[sub_index], &b##_hi[sub_index]); \
+ } while (0)
+
+#define BUTTERFLY_TWO_S32(a, left_index, right_index, left_constant, \
+ right_constant, b, add_index, sub_index) \
+ do { \
+ butterfly_two_coeff_s32(a##_lo[left_index], a##_hi[left_index], \
+ a##_lo[right_index], a##_hi[right_index], \
+ left_constant, right_constant, &b##_lo[add_index], \
+ &b##_hi[add_index], &b##_lo[sub_index], \
+ &b##_hi[sub_index]); \
+ } while (0)
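+
+// The ##_lo / ##_hi token pasting splits each 8-lane s16 vector into two
+// 4-lane s32 halves; e.g. ADD_S16_S32(b, 0, 7, c, 0) expands to:
+//   c_lo[0] = vaddl_s16(vget_low_s16(b[0]), vget_low_s16(b[7]));
+//   c_hi[0] = vaddl_s16(vget_high_s16(b[0]), vget_high_s16(b[7]));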
+
+static INLINE void dct_body_second_pass(const int16x8_t *in, int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+ int32x4_t c_lo[32];
+ int32x4_t c_hi[32];
+ int32x4_t d_lo[32];
+ int32x4_t d_hi[32];
+
+ // Stage 1. (In the first pass this cross was folded into the load; here it
+ // is performed explicitly.)
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ b[0] = vaddq_s16(a[0], a[15]);
+ b[1] = vaddq_s16(a[1], a[14]);
+ b[2] = vaddq_s16(a[2], a[13]);
+ b[3] = vaddq_s16(a[3], a[12]);
+ b[4] = vaddq_s16(a[4], a[11]);
+ b[5] = vaddq_s16(a[5], a[10]);
+ b[6] = vaddq_s16(a[6], a[9]);
+ b[7] = vaddq_s16(a[7], a[8]);
+
+ b[8] = vsubq_s16(a[7], a[8]);
+ b[9] = vsubq_s16(a[6], a[9]);
+ b[10] = vsubq_s16(a[5], a[10]);
+ b[11] = vsubq_s16(a[4], a[11]);
+ b[12] = vsubq_s16(a[3], a[12]);
+ b[13] = vsubq_s16(a[2], a[13]);
+ b[14] = vsubq_s16(a[1], a[14]);
+ b[15] = vsubq_s16(a[0], a[15]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+ b[18] = a[18];
+ b[19] = a[19];
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+
+ b[28] = a[28];
+ b[29] = a[29];
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 3. With extreme input values this calculation overflows int16_t:
+ // the sources of b[0] are accumulated several times and, in testing, have
+ // been shown to overflow starting here, so the math widens to int32 from
+ // this stage on.
+ ADD_S16_S32(b, 0, 7, c, 0);
+ ADD_S16_S32(b, 1, 6, c, 1);
+ ADD_S16_S32(b, 2, 5, c, 2);
+ ADD_S16_S32(b, 3, 4, c, 3);
+ SUB_S16_S32(b, 3, 4, c, 4);
+ SUB_S16_S32(b, 2, 5, c, 5);
+ SUB_S16_S32(b, 1, 6, c, 6);
+ SUB_S16_S32(b, 0, 7, c, 7);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ BUTTERFLY_ONE_S16_S32(b, 13, 10, cospi_16_64, c, 13, 10);
+ BUTTERFLY_ONE_S16_S32(b, 12, 11, cospi_16_64, c, 12, 11);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ ADD_S16_S32(b, 16, 23, c, 16);
+ ADD_S16_S32(b, 17, 22, c, 17);
+ ADD_S16_S32(b, 18, 21, c, 18);
+ ADD_S16_S32(b, 19, 20, c, 19);
+ SUB_S16_S32(b, 19, 20, c, 20);
+ SUB_S16_S32(b, 18, 21, c, 21);
+ SUB_S16_S32(b, 17, 22, c, 22);
+ SUB_S16_S32(b, 16, 23, c, 23);
+ SUB_S16_S32(b, 31, 24, c, 24);
+ SUB_S16_S32(b, 30, 25, c, 25);
+ SUB_S16_S32(b, 29, 26, c, 26);
+ SUB_S16_S32(b, 28, 27, c, 27);
+ ADD_S16_S32(b, 28, 27, c, 28);
+ ADD_S16_S32(b, 29, 26, c, 29);
+ ADD_S16_S32(b, 30, 25, c, 30);
+ ADD_S16_S32(b, 31, 24, c, 31);
+
+ // Stage 4.
+ ADD_S32(c, 0, 3, d, 0);
+ ADD_S32(c, 1, 2, d, 1);
+ SUB_S32(c, 1, 2, d, 2);
+ SUB_S32(c, 0, 3, d, 3);
+
+ PASS_THROUGH(c, d, 4);
+
+ BUTTERFLY_ONE_S32(c, 6, 5, cospi_16_64, d, 6, 5);
+
+ PASS_THROUGH(c, d, 7);
+
+ ADDW_S16_S32(c, 11, a, 8, d, 8);
+ ADDW_S16_S32(c, 10, a, 9, d, 9);
+ SUBW_S16_S32(a, 9, c, 10, c, 9, d, 10);
+ SUBW_S16_S32(a, 8, c, 11, c, 8, d, 11);
+ SUBW_S16_S32(a, 15, c, 12, c, 15, d, 12);
+ SUBW_S16_S32(a, 14, c, 13, c, 14, d, 13);
+ ADDW_S16_S32(c, 13, b, 14, d, 14);
+ ADDW_S16_S32(c, 12, b, 15, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 17);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_8_64, cospi_24_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_8_64, cospi_24_64, d, 28, 19);
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_24_64, -cospi_8_64, d, 27, 20);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_24_64, -cospi_8_64, d, 26, 21);
+
+ PASS_THROUGH(c, d, 22);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 25);
+
+ PASS_THROUGH(c, d, 30);
+ PASS_THROUGH(c, d, 31);
+
+ // Stage 5.
+ BUTTERFLY_ONE_S32(d, 0, 1, cospi_16_64, c, 0, 1);
+ BUTTERFLY_TWO_S32(d, 3, 2, cospi_8_64, cospi_24_64, c, 2, 3);
+
+ ADD_S32(d, 4, 5, c, 4);
+ SUB_S32(d, 4, 5, c, 5);
+ SUB_S32(d, 7, 6, c, 6);
+ ADD_S32(d, 7, 6, c, 7);
+
+ PASS_THROUGH(d, c, 8);
+
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_8_64, cospi_24_64, c, 14, 9);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_24_64, -cospi_8_64, c, 13, 10);
+
+ PASS_THROUGH(d, c, 11);
+ PASS_THROUGH(d, c, 12);
+ PASS_THROUGH(d, c, 15);
+
+ ADD_S32(d, 16, 19, c, 16);
+ ADD_S32(d, 17, 18, c, 17);
+ SUB_S32(d, 17, 18, c, 18);
+ SUB_S32(d, 16, 19, c, 19);
+ SUB_S32(d, 23, 20, c, 20);
+ SUB_S32(d, 22, 21, c, 21);
+ ADD_S32(d, 22, 21, c, 22);
+ ADD_S32(d, 23, 20, c, 23);
+ ADD_S32(d, 24, 27, c, 24);
+ ADD_S32(d, 25, 26, c, 25);
+ SUB_S32(d, 25, 26, c, 26);
+ SUB_S32(d, 24, 27, c, 27);
+ SUB_S32(d, 31, 28, c, 28);
+ SUB_S32(d, 30, 29, c, 29);
+ ADD_S32(d, 30, 29, c, 30);
+ ADD_S32(d, 31, 28, c, 31);
+
+ // Stage 6.
+ PASS_THROUGH(c, d, 0);
+ PASS_THROUGH(c, d, 1);
+ PASS_THROUGH(c, d, 2);
+ PASS_THROUGH(c, d, 3);
+
+ BUTTERFLY_TWO_S32(c, 7, 4, cospi_4_64, cospi_28_64, d, 4, 7);
+ BUTTERFLY_TWO_S32(c, 6, 5, cospi_20_64, cospi_12_64, d, 5, 6);
+
+ ADD_S32(c, 8, 9, d, 8);
+ SUB_S32(c, 8, 9, d, 9);
+ SUB_S32(c, 11, 10, d, 10);
+ ADD_S32(c, 11, 10, d, 11);
+ ADD_S32(c, 12, 13, d, 12);
+ SUB_S32(c, 12, 13, d, 13);
+ SUB_S32(c, 15, 14, d, 14);
+ ADD_S32(c, 15, 14, d, 15);
+
+ PASS_THROUGH(c, d, 16);
+ PASS_THROUGH(c, d, 19);
+ PASS_THROUGH(c, d, 20);
+ PASS_THROUGH(c, d, 23);
+ PASS_THROUGH(c, d, 24);
+ PASS_THROUGH(c, d, 27);
+ PASS_THROUGH(c, d, 28);
+ PASS_THROUGH(c, d, 31);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_4_64, cospi_28_64, d, 30, 17);
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_28_64, -cospi_4_64, d, 29, 18);
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_20_64, cospi_12_64, d, 26, 21);
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_12_64, -cospi_20_64, d, 25, 22);
+
+ // Stage 7.
+ PASS_THROUGH(d, c, 0);
+ PASS_THROUGH(d, c, 1);
+ PASS_THROUGH(d, c, 2);
+ PASS_THROUGH(d, c, 3);
+ PASS_THROUGH(d, c, 4);
+ PASS_THROUGH(d, c, 5);
+ PASS_THROUGH(d, c, 6);
+ PASS_THROUGH(d, c, 7);
+
+ BUTTERFLY_TWO_S32(d, 15, 8, cospi_2_64, cospi_30_64, c, 8, 15);
+ BUTTERFLY_TWO_S32(d, 14, 9, cospi_18_64, cospi_14_64, c, 9, 14);
+ BUTTERFLY_TWO_S32(d, 13, 10, cospi_10_64, cospi_22_64, c, 10, 13);
+ BUTTERFLY_TWO_S32(d, 12, 11, cospi_26_64, cospi_6_64, c, 11, 12);
+
+ ADD_S32(d, 16, 17, c, 16);
+ SUB_S32(d, 16, 17, c, 17);
+ SUB_S32(d, 19, 18, c, 18);
+ ADD_S32(d, 19, 18, c, 19);
+ ADD_S32(d, 20, 21, c, 20);
+ SUB_S32(d, 20, 21, c, 21);
+ SUB_S32(d, 23, 22, c, 22);
+ ADD_S32(d, 23, 22, c, 23);
+ ADD_S32(d, 24, 25, c, 24);
+ SUB_S32(d, 24, 25, c, 25);
+ SUB_S32(d, 27, 26, c, 26);
+ ADD_S32(d, 27, 26, c, 27);
+ ADD_S32(d, 28, 29, c, 28);
+ SUB_S32(d, 28, 29, c, 29);
+ SUB_S32(d, 31, 30, c, 30);
+ ADD_S32(d, 31, 30, c, 31);
+
+ // Final stage.
+ // Roll rounding into this function so we can pass back int16x8.
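+ // add_round_shift_s32_narrow presumably applies the scalar second-pass
+ // rounding, (v + 1 + (v < 0)) >> 2, to each 4-lane half and narrows the
+ // result back to int16 lanes (an assumption; see fdct_neon.h).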
+
+ out[0] = add_round_shift_s32_narrow(c_lo[0], c_hi[0]);
+ out[16] = add_round_shift_s32_narrow(c_lo[1], c_hi[1]);
+
+ out[8] = add_round_shift_s32_narrow(c_lo[2], c_hi[2]);
+ out[24] = add_round_shift_s32_narrow(c_lo[3], c_hi[3]);
+ out[4] = add_round_shift_s32_narrow(c_lo[4], c_hi[4]);
+ out[20] = add_round_shift_s32_narrow(c_lo[5], c_hi[5]);
+ out[12] = add_round_shift_s32_narrow(c_lo[6], c_hi[6]);
+
+ out[28] = add_round_shift_s32_narrow(c_lo[7], c_hi[7]);
+ out[2] = add_round_shift_s32_narrow(c_lo[8], c_hi[8]);
+ out[18] = add_round_shift_s32_narrow(c_lo[9], c_hi[9]);
+ out[10] = add_round_shift_s32_narrow(c_lo[10], c_hi[10]);
+
+ out[26] = add_round_shift_s32_narrow(c_lo[11], c_hi[11]);
+ out[6] = add_round_shift_s32_narrow(c_lo[12], c_hi[12]);
+ out[22] = add_round_shift_s32_narrow(c_lo[13], c_hi[13]);
+ out[14] = add_round_shift_s32_narrow(c_lo[14], c_hi[14]);
+ out[30] = add_round_shift_s32_narrow(c_lo[15], c_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 31, 16, cospi_1_64, cospi_31_64, d, 1, 31);
+ out[1] = add_round_shift_s32_narrow(d_lo[1], d_hi[1]);
+ out[31] = add_round_shift_s32_narrow(d_lo[31], d_hi[31]);
+
+ BUTTERFLY_TWO_S32(c, 30, 17, cospi_17_64, cospi_15_64, d, 17, 15);
+ out[17] = add_round_shift_s32_narrow(d_lo[17], d_hi[17]);
+ out[15] = add_round_shift_s32_narrow(d_lo[15], d_hi[15]);
+
+ BUTTERFLY_TWO_S32(c, 29, 18, cospi_9_64, cospi_23_64, d, 9, 23);
+ out[9] = add_round_shift_s32_narrow(d_lo[9], d_hi[9]);
+ out[23] = add_round_shift_s32_narrow(d_lo[23], d_hi[23]);
+
+ BUTTERFLY_TWO_S32(c, 28, 19, cospi_25_64, cospi_7_64, d, 25, 7);
+ out[25] = add_round_shift_s32_narrow(d_lo[25], d_hi[25]);
+ out[7] = add_round_shift_s32_narrow(d_lo[7], d_hi[7]);
+
+ BUTTERFLY_TWO_S32(c, 27, 20, cospi_5_64, cospi_27_64, d, 5, 27);
+ out[5] = add_round_shift_s32_narrow(d_lo[5], d_hi[5]);
+ out[27] = add_round_shift_s32_narrow(d_lo[27], d_hi[27]);
+
+ BUTTERFLY_TWO_S32(c, 26, 21, cospi_21_64, cospi_11_64, d, 21, 11);
+ out[21] = add_round_shift_s32_narrow(d_lo[21], d_hi[21]);
+ out[11] = add_round_shift_s32_narrow(d_lo[11], d_hi[11]);
+
+ BUTTERFLY_TWO_S32(c, 25, 22, cospi_13_64, cospi_19_64, d, 13, 19);
+ out[13] = add_round_shift_s32_narrow(d_lo[13], d_hi[13]);
+ out[19] = add_round_shift_s32_narrow(d_lo[19], d_hi[19]);
+
+ BUTTERFLY_TWO_S32(c, 24, 23, cospi_29_64, cospi_3_64, d, 29, 3);
+ out[29] = add_round_shift_s32_narrow(d_lo[29], d_hi[29]);
+ out[3] = add_round_shift_s32_narrow(d_lo[3], d_hi[3]);
+}
+
+static INLINE void dct_body_second_pass_rd(const int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t a[32];
+ int16x8_t b[32];
+
+ // Stage 1. (In the first pass this cross was folded into the load; here it
+ // is performed explicitly.)
+ a[0] = vaddq_s16(in[0], in[31]);
+ a[1] = vaddq_s16(in[1], in[30]);
+ a[2] = vaddq_s16(in[2], in[29]);
+ a[3] = vaddq_s16(in[3], in[28]);
+ a[4] = vaddq_s16(in[4], in[27]);
+ a[5] = vaddq_s16(in[5], in[26]);
+ a[6] = vaddq_s16(in[6], in[25]);
+ a[7] = vaddq_s16(in[7], in[24]);
+ a[8] = vaddq_s16(in[8], in[23]);
+ a[9] = vaddq_s16(in[9], in[22]);
+ a[10] = vaddq_s16(in[10], in[21]);
+ a[11] = vaddq_s16(in[11], in[20]);
+ a[12] = vaddq_s16(in[12], in[19]);
+ a[13] = vaddq_s16(in[13], in[18]);
+ a[14] = vaddq_s16(in[14], in[17]);
+ a[15] = vaddq_s16(in[15], in[16]);
+ a[16] = vsubq_s16(in[15], in[16]);
+ a[17] = vsubq_s16(in[14], in[17]);
+ a[18] = vsubq_s16(in[13], in[18]);
+ a[19] = vsubq_s16(in[12], in[19]);
+ a[20] = vsubq_s16(in[11], in[20]);
+ a[21] = vsubq_s16(in[10], in[21]);
+ a[22] = vsubq_s16(in[9], in[22]);
+ a[23] = vsubq_s16(in[8], in[23]);
+ a[24] = vsubq_s16(in[7], in[24]);
+ a[25] = vsubq_s16(in[6], in[25]);
+ a[26] = vsubq_s16(in[5], in[26]);
+ a[27] = vsubq_s16(in[4], in[27]);
+ a[28] = vsubq_s16(in[3], in[28]);
+ a[29] = vsubq_s16(in[2], in[29]);
+ a[30] = vsubq_s16(in[1], in[30]);
+ a[31] = vsubq_s16(in[0], in[31]);
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
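+ // A scalar sketch of that rounding (half_round_shift in the C code):
+ //   v = (v + 1 + (v < 0)) >> 2,  e.g. 6 -> 1 and -6 -> -1.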
+ b[0] = add_round_shift_s16(vaddq_s16(a[0], a[15]));
+ b[1] = add_round_shift_s16(vaddq_s16(a[1], a[14]));
+ b[2] = add_round_shift_s16(vaddq_s16(a[2], a[13]));
+ b[3] = add_round_shift_s16(vaddq_s16(a[3], a[12]));
+ b[4] = add_round_shift_s16(vaddq_s16(a[4], a[11]));
+ b[5] = add_round_shift_s16(vaddq_s16(a[5], a[10]));
+ b[6] = add_round_shift_s16(vaddq_s16(a[6], a[9]));
+ b[7] = add_round_shift_s16(vaddq_s16(a[7], a[8]));
+
+ b[8] = add_round_shift_s16(vsubq_s16(a[7], a[8]));
+ b[9] = add_round_shift_s16(vsubq_s16(a[6], a[9]));
+ b[10] = add_round_shift_s16(vsubq_s16(a[5], a[10]));
+ b[11] = add_round_shift_s16(vsubq_s16(a[4], a[11]));
+ b[12] = add_round_shift_s16(vsubq_s16(a[3], a[12]));
+ b[13] = add_round_shift_s16(vsubq_s16(a[2], a[13]));
+ b[14] = add_round_shift_s16(vsubq_s16(a[1], a[14]));
+ b[15] = add_round_shift_s16(vsubq_s16(a[0], a[15]));
+
+ b[16] = add_round_shift_s16(a[16]);
+ b[17] = add_round_shift_s16(a[17]);
+ b[18] = add_round_shift_s16(a[18]);
+ b[19] = add_round_shift_s16(a[19]);
+
+ butterfly_one_coeff_s16_s32_narrow(a[27], a[20], cospi_16_64, &b[27], &b[20]);
+ butterfly_one_coeff_s16_s32_narrow(a[26], a[21], cospi_16_64, &b[26], &b[21]);
+ butterfly_one_coeff_s16_s32_narrow(a[25], a[22], cospi_16_64, &b[25], &b[22]);
+ butterfly_one_coeff_s16_s32_narrow(a[24], a[23], cospi_16_64, &b[24], &b[23]);
+ b[20] = add_round_shift_s16(b[20]);
+ b[21] = add_round_shift_s16(b[21]);
+ b[22] = add_round_shift_s16(b[22]);
+ b[23] = add_round_shift_s16(b[23]);
+ b[24] = add_round_shift_s16(b[24]);
+ b[25] = add_round_shift_s16(b[25]);
+ b[26] = add_round_shift_s16(b[26]);
+ b[27] = add_round_shift_s16(b[27]);
+
+ b[28] = add_round_shift_s16(a[28]);
+ b[29] = add_round_shift_s16(a[29]);
+ b[30] = add_round_shift_s16(a[30]);
+ b[31] = add_round_shift_s16(a[31]);
+
+ // Stage 3.
+ a[0] = vaddq_s16(b[0], b[7]);
+ a[1] = vaddq_s16(b[1], b[6]);
+ a[2] = vaddq_s16(b[2], b[5]);
+ a[3] = vaddq_s16(b[3], b[4]);
+
+ a[4] = vsubq_s16(b[3], b[4]);
+ a[5] = vsubq_s16(b[2], b[5]);
+ a[6] = vsubq_s16(b[1], b[6]);
+ a[7] = vsubq_s16(b[0], b[7]);
+
+ a[8] = b[8];
+ a[9] = b[9];
+
+ butterfly_one_coeff_s16_s32_narrow(b[13], b[10], cospi_16_64, &a[13], &a[10]);
+ butterfly_one_coeff_s16_s32_narrow(b[12], b[11], cospi_16_64, &a[12], &a[11]);
+
+ a[14] = b[14];
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[16], b[23]);
+ a[17] = vaddq_s16(b[17], b[22]);
+ a[18] = vaddq_s16(b[18], b[21]);
+ a[19] = vaddq_s16(b[19], b[20]);
+
+ a[20] = vsubq_s16(b[19], b[20]);
+ a[21] = vsubq_s16(b[18], b[21]);
+ a[22] = vsubq_s16(b[17], b[22]);
+ a[23] = vsubq_s16(b[16], b[23]);
+
+ a[24] = vsubq_s16(b[31], b[24]);
+ a[25] = vsubq_s16(b[30], b[25]);
+ a[26] = vsubq_s16(b[29], b[26]);
+ a[27] = vsubq_s16(b[28], b[27]);
+
+ a[28] = vaddq_s16(b[28], b[27]);
+ a[29] = vaddq_s16(b[29], b[26]);
+ a[30] = vaddq_s16(b[30], b[25]);
+ a[31] = vaddq_s16(b[31], b[24]);
+
+ // Stage 4.
+ b[0] = vaddq_s16(a[0], a[3]);
+ b[1] = vaddq_s16(a[1], a[2]);
+ b[2] = vsubq_s16(a[1], a[2]);
+ b[3] = vsubq_s16(a[0], a[3]);
+
+ b[4] = a[4];
+
+ butterfly_one_coeff_s16_s32_narrow(a[6], a[5], cospi_16_64, &b[6], &b[5]);
+
+ b[7] = a[7];
+
+ b[8] = vaddq_s16(a[8], a[11]);
+ b[9] = vaddq_s16(a[9], a[10]);
+ b[10] = vsubq_s16(a[9], a[10]);
+ b[11] = vsubq_s16(a[8], a[11]);
+ b[12] = vsubq_s16(a[15], a[12]);
+ b[13] = vsubq_s16(a[14], a[13]);
+ b[14] = vaddq_s16(a[14], a[13]);
+ b[15] = vaddq_s16(a[15], a[12]);
+
+ b[16] = a[16];
+ b[17] = a[17];
+
+ butterfly_two_coeff(a[29], a[18], cospi_8_64, cospi_24_64, &b[29], &b[18]);
+ butterfly_two_coeff(a[28], a[19], cospi_8_64, cospi_24_64, &b[28], &b[19]);
+ butterfly_two_coeff(a[27], a[20], cospi_24_64, -cospi_8_64, &b[27], &b[20]);
+ butterfly_two_coeff(a[26], a[21], cospi_24_64, -cospi_8_64, &b[26], &b[21]);
+
+ b[22] = a[22];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[25] = a[25];
+
+ b[30] = a[30];
+ b[31] = a[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s16_s32_narrow(b[0], b[1], cospi_16_64, &a[0], &a[1]);
+ butterfly_two_coeff(b[3], b[2], cospi_8_64, cospi_24_64, &a[2], &a[3]);
+
+ a[4] = vaddq_s16(b[4], b[5]);
+ a[5] = vsubq_s16(b[4], b[5]);
+ a[6] = vsubq_s16(b[7], b[6]);
+ a[7] = vaddq_s16(b[7], b[6]);
+
+ a[8] = b[8];
+
+ butterfly_two_coeff(b[14], b[9], cospi_8_64, cospi_24_64, &a[14], &a[9]);
+ butterfly_two_coeff(b[13], b[10], cospi_24_64, -cospi_8_64, &a[13], &a[10]);
+
+ a[11] = b[11];
+ a[12] = b[12];
+
+ a[15] = b[15];
+
+ a[16] = vaddq_s16(b[19], b[16]);
+ a[17] = vaddq_s16(b[18], b[17]);
+ a[18] = vsubq_s16(b[17], b[18]);
+ a[19] = vsubq_s16(b[16], b[19]);
+ a[20] = vsubq_s16(b[23], b[20]);
+ a[21] = vsubq_s16(b[22], b[21]);
+ a[22] = vaddq_s16(b[21], b[22]);
+ a[23] = vaddq_s16(b[20], b[23]);
+ a[24] = vaddq_s16(b[27], b[24]);
+ a[25] = vaddq_s16(b[26], b[25]);
+ a[26] = vsubq_s16(b[25], b[26]);
+ a[27] = vsubq_s16(b[24], b[27]);
+ a[28] = vsubq_s16(b[31], b[28]);
+ a[29] = vsubq_s16(b[30], b[29]);
+ a[30] = vaddq_s16(b[29], b[30]);
+ a[31] = vaddq_s16(b[28], b[31]);
+
+ // Stage 6.
+ b[0] = a[0];
+ b[1] = a[1];
+ b[2] = a[2];
+ b[3] = a[3];
+
+ butterfly_two_coeff(a[7], a[4], cospi_4_64, cospi_28_64, &b[4], &b[7]);
+ butterfly_two_coeff(a[6], a[5], cospi_20_64, cospi_12_64, &b[5], &b[6]);
+
+ b[8] = vaddq_s16(a[8], a[9]);
+ b[9] = vsubq_s16(a[8], a[9]);
+ b[10] = vsubq_s16(a[11], a[10]);
+ b[11] = vaddq_s16(a[11], a[10]);
+ b[12] = vaddq_s16(a[12], a[13]);
+ b[13] = vsubq_s16(a[12], a[13]);
+ b[14] = vsubq_s16(a[15], a[14]);
+ b[15] = vaddq_s16(a[15], a[14]);
+
+ b[16] = a[16];
+ b[19] = a[19];
+ b[20] = a[20];
+ b[23] = a[23];
+ b[24] = a[24];
+ b[27] = a[27];
+ b[28] = a[28];
+ b[31] = a[31];
+
+ butterfly_two_coeff(a[30], a[17], cospi_4_64, cospi_28_64, &b[30], &b[17]);
+ butterfly_two_coeff(a[29], a[18], cospi_28_64, -cospi_4_64, &b[29], &b[18]);
+
+ butterfly_two_coeff(a[26], a[21], cospi_20_64, cospi_12_64, &b[26], &b[21]);
+ butterfly_two_coeff(a[25], a[22], cospi_12_64, -cospi_20_64, &b[25], &b[22]);
+
+ // Stage 7.
+ a[0] = b[0];
+ a[1] = b[1];
+ a[2] = b[2];
+ a[3] = b[3];
+ a[4] = b[4];
+ a[5] = b[5];
+ a[6] = b[6];
+ a[7] = b[7];
+
+ butterfly_two_coeff(b[15], b[8], cospi_2_64, cospi_30_64, &a[8], &a[15]);
+ butterfly_two_coeff(b[14], b[9], cospi_18_64, cospi_14_64, &a[9], &a[14]);
+ butterfly_two_coeff(b[13], b[10], cospi_10_64, cospi_22_64, &a[10], &a[13]);
+ butterfly_two_coeff(b[12], b[11], cospi_26_64, cospi_6_64, &a[11], &a[12]);
+
+ a[16] = vaddq_s16(b[16], b[17]);
+ a[17] = vsubq_s16(b[16], b[17]);
+ a[18] = vsubq_s16(b[19], b[18]);
+ a[19] = vaddq_s16(b[19], b[18]);
+ a[20] = vaddq_s16(b[20], b[21]);
+ a[21] = vsubq_s16(b[20], b[21]);
+ a[22] = vsubq_s16(b[23], b[22]);
+ a[23] = vaddq_s16(b[23], b[22]);
+ a[24] = vaddq_s16(b[24], b[25]);
+ a[25] = vsubq_s16(b[24], b[25]);
+ a[26] = vsubq_s16(b[27], b[26]);
+ a[27] = vaddq_s16(b[27], b[26]);
+ a[28] = vaddq_s16(b[28], b[29]);
+ a[29] = vsubq_s16(b[28], b[29]);
+ a[30] = vsubq_s16(b[31], b[30]);
+ a[31] = vaddq_s16(b[31], b[30]);
+
+ // Final stage.
+ out[0] = a[0];
+ out[16] = a[1];
+ out[8] = a[2];
+ out[24] = a[3];
+ out[4] = a[4];
+ out[20] = a[5];
+ out[12] = a[6];
+ out[28] = a[7];
+ out[2] = a[8];
+ out[18] = a[9];
+ out[10] = a[10];
+ out[26] = a[11];
+ out[6] = a[12];
+ out[22] = a[13];
+ out[14] = a[14];
+ out[30] = a[15];
+
+ butterfly_two_coeff(a[31], a[16], cospi_1_64, cospi_31_64, &out[1], &out[31]);
+ butterfly_two_coeff(a[30], a[17], cospi_17_64, cospi_15_64, &out[17],
+ &out[15]);
+ butterfly_two_coeff(a[29], a[18], cospi_9_64, cospi_23_64, &out[9], &out[23]);
+ butterfly_two_coeff(a[28], a[19], cospi_25_64, cospi_7_64, &out[25], &out[7]);
+ butterfly_two_coeff(a[27], a[20], cospi_5_64, cospi_27_64, &out[5], &out[27]);
+ butterfly_two_coeff(a[26], a[21], cospi_21_64, cospi_11_64, &out[21],
+ &out[11]);
+ butterfly_two_coeff(a[25], a[22], cospi_13_64, cospi_19_64, &out[13],
+ &out[19]);
+ butterfly_two_coeff(a[24], a[23], cospi_29_64, cospi_3_64, &out[29], &out[3]);
+}
+
+#undef PASS_THROUGH
+#undef ADD_S16_S32
+#undef SUB_S16_S32
+#undef ADDW_S16_S32
+#undef SUBW_S16_S32
+#undef ADD_S32
+#undef SUB_S32
+#undef BUTTERFLY_ONE_S16_S32
+#undef BUTTERFLY_ONE_S32
+#undef BUTTERFLY_TWO_S32
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Store a 32x32 block of 32-bit coefficients, assuming stride == 32.
+static INLINE void store32x32_s32(
+ tran_low_t *a, const int32x4_t *l1 /*[32]*/, const int32x4_t *r1 /*[32]*/,
+ const int32x4_t *l2 /*[32]*/, const int32x4_t *r2 /*[32]*/,
+ const int32x4_t *l3 /*[32]*/, const int32x4_t *r3 /*[32]*/,
+ const int32x4_t *l4 /*[32]*/, const int32x4_t *r4 /*[32]*/) {
+ int i;
+ for (i = 0; i < 32; i++) {
+ vst1q_s32(a, l1[i]);
+ vst1q_s32(a + 4, r1[i]);
+ vst1q_s32(a + 8, l2[i]);
+ vst1q_s32(a + 12, r2[i]);
+ vst1q_s32(a + 16, l3[i]);
+ vst1q_s32(a + 20, r3[i]);
+ vst1q_s32(a + 24, l4[i]);
+ vst1q_s32(a + 28, r4[i]);
+ a += 32;
+ }
+}
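+
+// Row i of the output is thus l1[i] | r1[i] | l2[i] | r2[i] | l3[i] | r3[i] |
+// l4[i] | r4[i], four s32 lanes from each vector.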
+
+static INLINE void highbd_scale_input(const int16x8_t *a /*[32]*/,
+ int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ left[0] = vshll_n_s16(vget_low_s16(a[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(a[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(a[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(a[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(a[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(a[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(a[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(a[7]), 2);
+ left[8] = vshll_n_s16(vget_low_s16(a[8]), 2);
+ left[9] = vshll_n_s16(vget_low_s16(a[9]), 2);
+ left[10] = vshll_n_s16(vget_low_s16(a[10]), 2);
+ left[11] = vshll_n_s16(vget_low_s16(a[11]), 2);
+ left[12] = vshll_n_s16(vget_low_s16(a[12]), 2);
+ left[13] = vshll_n_s16(vget_low_s16(a[13]), 2);
+ left[14] = vshll_n_s16(vget_low_s16(a[14]), 2);
+ left[15] = vshll_n_s16(vget_low_s16(a[15]), 2);
+ left[16] = vshll_n_s16(vget_low_s16(a[16]), 2);
+ left[17] = vshll_n_s16(vget_low_s16(a[17]), 2);
+ left[18] = vshll_n_s16(vget_low_s16(a[18]), 2);
+ left[19] = vshll_n_s16(vget_low_s16(a[19]), 2);
+ left[20] = vshll_n_s16(vget_low_s16(a[20]), 2);
+ left[21] = vshll_n_s16(vget_low_s16(a[21]), 2);
+ left[22] = vshll_n_s16(vget_low_s16(a[22]), 2);
+ left[23] = vshll_n_s16(vget_low_s16(a[23]), 2);
+ left[24] = vshll_n_s16(vget_low_s16(a[24]), 2);
+ left[25] = vshll_n_s16(vget_low_s16(a[25]), 2);
+ left[26] = vshll_n_s16(vget_low_s16(a[26]), 2);
+ left[27] = vshll_n_s16(vget_low_s16(a[27]), 2);
+ left[28] = vshll_n_s16(vget_low_s16(a[28]), 2);
+ left[29] = vshll_n_s16(vget_low_s16(a[29]), 2);
+ left[30] = vshll_n_s16(vget_low_s16(a[30]), 2);
+ left[31] = vshll_n_s16(vget_low_s16(a[31]), 2);
+
+ right[0] = vshll_n_s16(vget_high_s16(a[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(a[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(a[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(a[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(a[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(a[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(a[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(a[7]), 2);
+ right[8] = vshll_n_s16(vget_high_s16(a[8]), 2);
+ right[9] = vshll_n_s16(vget_high_s16(a[9]), 2);
+ right[10] = vshll_n_s16(vget_high_s16(a[10]), 2);
+ right[11] = vshll_n_s16(vget_high_s16(a[11]), 2);
+ right[12] = vshll_n_s16(vget_high_s16(a[12]), 2);
+ right[13] = vshll_n_s16(vget_high_s16(a[13]), 2);
+ right[14] = vshll_n_s16(vget_high_s16(a[14]), 2);
+ right[15] = vshll_n_s16(vget_high_s16(a[15]), 2);
+ right[16] = vshll_n_s16(vget_high_s16(a[16]), 2);
+ right[17] = vshll_n_s16(vget_high_s16(a[17]), 2);
+ right[18] = vshll_n_s16(vget_high_s16(a[18]), 2);
+ right[19] = vshll_n_s16(vget_high_s16(a[19]), 2);
+ right[20] = vshll_n_s16(vget_high_s16(a[20]), 2);
+ right[21] = vshll_n_s16(vget_high_s16(a[21]), 2);
+ right[22] = vshll_n_s16(vget_high_s16(a[22]), 2);
+ right[23] = vshll_n_s16(vget_high_s16(a[23]), 2);
+ right[24] = vshll_n_s16(vget_high_s16(a[24]), 2);
+ right[25] = vshll_n_s16(vget_high_s16(a[25]), 2);
+ right[26] = vshll_n_s16(vget_high_s16(a[26]), 2);
+ right[27] = vshll_n_s16(vget_high_s16(a[27]), 2);
+ right[28] = vshll_n_s16(vget_high_s16(a[28]), 2);
+ right[29] = vshll_n_s16(vget_high_s16(a[29]), 2);
+ right[30] = vshll_n_s16(vget_high_s16(a[30]), 2);
+ right[31] = vshll_n_s16(vget_high_s16(a[31]), 2);
+}
+
+static INLINE void highbd_cross_input(const int32x4_t *a_left /*[32]*/,
+ int32x4_t *a_right /*[32]*/,
+ int32x4_t *b_left /*[32]*/,
+ int32x4_t *b_right /*[32]*/) {
+ // Stage 1. (In the first pass this cross was folded into the load; here it
+ // is performed explicitly.)
+ b_left[0] = vaddq_s32(a_left[0], a_left[31]);
+ b_left[1] = vaddq_s32(a_left[1], a_left[30]);
+ b_left[2] = vaddq_s32(a_left[2], a_left[29]);
+ b_left[3] = vaddq_s32(a_left[3], a_left[28]);
+ b_left[4] = vaddq_s32(a_left[4], a_left[27]);
+ b_left[5] = vaddq_s32(a_left[5], a_left[26]);
+ b_left[6] = vaddq_s32(a_left[6], a_left[25]);
+ b_left[7] = vaddq_s32(a_left[7], a_left[24]);
+ b_left[8] = vaddq_s32(a_left[8], a_left[23]);
+ b_left[9] = vaddq_s32(a_left[9], a_left[22]);
+ b_left[10] = vaddq_s32(a_left[10], a_left[21]);
+ b_left[11] = vaddq_s32(a_left[11], a_left[20]);
+ b_left[12] = vaddq_s32(a_left[12], a_left[19]);
+ b_left[13] = vaddq_s32(a_left[13], a_left[18]);
+ b_left[14] = vaddq_s32(a_left[14], a_left[17]);
+ b_left[15] = vaddq_s32(a_left[15], a_left[16]);
+
+ b_right[0] = vaddq_s32(a_right[0], a_right[31]);
+ b_right[1] = vaddq_s32(a_right[1], a_right[30]);
+ b_right[2] = vaddq_s32(a_right[2], a_right[29]);
+ b_right[3] = vaddq_s32(a_right[3], a_right[28]);
+ b_right[4] = vaddq_s32(a_right[4], a_right[27]);
+ b_right[5] = vaddq_s32(a_right[5], a_right[26]);
+ b_right[6] = vaddq_s32(a_right[6], a_right[25]);
+ b_right[7] = vaddq_s32(a_right[7], a_right[24]);
+ b_right[8] = vaddq_s32(a_right[8], a_right[23]);
+ b_right[9] = vaddq_s32(a_right[9], a_right[22]);
+ b_right[10] = vaddq_s32(a_right[10], a_right[21]);
+ b_right[11] = vaddq_s32(a_right[11], a_right[20]);
+ b_right[12] = vaddq_s32(a_right[12], a_right[19]);
+ b_right[13] = vaddq_s32(a_right[13], a_right[18]);
+ b_right[14] = vaddq_s32(a_right[14], a_right[17]);
+ b_right[15] = vaddq_s32(a_right[15], a_right[16]);
+
+ b_left[16] = vsubq_s32(a_left[15], a_left[16]);
+ b_left[17] = vsubq_s32(a_left[14], a_left[17]);
+ b_left[18] = vsubq_s32(a_left[13], a_left[18]);
+ b_left[19] = vsubq_s32(a_left[12], a_left[19]);
+ b_left[20] = vsubq_s32(a_left[11], a_left[20]);
+ b_left[21] = vsubq_s32(a_left[10], a_left[21]);
+ b_left[22] = vsubq_s32(a_left[9], a_left[22]);
+ b_left[23] = vsubq_s32(a_left[8], a_left[23]);
+ b_left[24] = vsubq_s32(a_left[7], a_left[24]);
+ b_left[25] = vsubq_s32(a_left[6], a_left[25]);
+ b_left[26] = vsubq_s32(a_left[5], a_left[26]);
+ b_left[27] = vsubq_s32(a_left[4], a_left[27]);
+ b_left[28] = vsubq_s32(a_left[3], a_left[28]);
+ b_left[29] = vsubq_s32(a_left[2], a_left[29]);
+ b_left[30] = vsubq_s32(a_left[1], a_left[30]);
+ b_left[31] = vsubq_s32(a_left[0], a_left[31]);
+
+ b_right[16] = vsubq_s32(a_right[15], a_right[16]);
+ b_right[17] = vsubq_s32(a_right[14], a_right[17]);
+ b_right[18] = vsubq_s32(a_right[13], a_right[18]);
+ b_right[19] = vsubq_s32(a_right[12], a_right[19]);
+ b_right[20] = vsubq_s32(a_right[11], a_right[20]);
+ b_right[21] = vsubq_s32(a_right[10], a_right[21]);
+ b_right[22] = vsubq_s32(a_right[9], a_right[22]);
+ b_right[23] = vsubq_s32(a_right[8], a_right[23]);
+ b_right[24] = vsubq_s32(a_right[7], a_right[24]);
+ b_right[25] = vsubq_s32(a_right[6], a_right[25]);
+ b_right[26] = vsubq_s32(a_right[5], a_right[26]);
+ b_right[27] = vsubq_s32(a_right[4], a_right[27]);
+ b_right[28] = vsubq_s32(a_right[3], a_right[28]);
+ b_right[29] = vsubq_s32(a_right[2], a_right[29]);
+ b_right[30] = vsubq_s32(a_right[1], a_right[30]);
+ b_right[31] = vsubq_s32(a_right[0], a_right[31]);
+}
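+
+// Same cross pattern as load_cross(), in scalar form per lane (sketch):
+//   for (int i = 0; i < 16; ++i) {
+//     b[i]      = a[i] + a[31 - i];
+//     b[31 - i] = a[i] - a[31 - i];
+//   }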
+
+static INLINE void highbd_partial_add_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+
+ left[0] = add_round_shift_s32(left[0]);
+ left[1] = add_round_shift_s32(left[1]);
+ left[2] = add_round_shift_s32(left[2]);
+ left[3] = add_round_shift_s32(left[3]);
+ left[4] = add_round_shift_s32(left[4]);
+ left[5] = add_round_shift_s32(left[5]);
+ left[6] = add_round_shift_s32(left[6]);
+ left[7] = add_round_shift_s32(left[7]);
+ left[8] = add_round_shift_s32(left[8]);
+ left[9] = add_round_shift_s32(left[9]);
+ left[10] = add_round_shift_s32(left[10]);
+ left[11] = add_round_shift_s32(left[11]);
+ left[12] = add_round_shift_s32(left[12]);
+ left[13] = add_round_shift_s32(left[13]);
+ left[14] = add_round_shift_s32(left[14]);
+ left[15] = add_round_shift_s32(left[15]);
+ left[16] = add_round_shift_s32(left[16]);
+ left[17] = add_round_shift_s32(left[17]);
+ left[18] = add_round_shift_s32(left[18]);
+ left[19] = add_round_shift_s32(left[19]);
+ left[20] = add_round_shift_s32(left[20]);
+ left[21] = add_round_shift_s32(left[21]);
+ left[22] = add_round_shift_s32(left[22]);
+ left[23] = add_round_shift_s32(left[23]);
+ left[24] = add_round_shift_s32(left[24]);
+ left[25] = add_round_shift_s32(left[25]);
+ left[26] = add_round_shift_s32(left[26]);
+ left[27] = add_round_shift_s32(left[27]);
+ left[28] = add_round_shift_s32(left[28]);
+ left[29] = add_round_shift_s32(left[29]);
+ left[30] = add_round_shift_s32(left[30]);
+ left[31] = add_round_shift_s32(left[31]);
+
+ right[0] = add_round_shift_s32(right[0]);
+ right[1] = add_round_shift_s32(right[1]);
+ right[2] = add_round_shift_s32(right[2]);
+ right[3] = add_round_shift_s32(right[3]);
+ right[4] = add_round_shift_s32(right[4]);
+ right[5] = add_round_shift_s32(right[5]);
+ right[6] = add_round_shift_s32(right[6]);
+ right[7] = add_round_shift_s32(right[7]);
+ right[8] = add_round_shift_s32(right[8]);
+ right[9] = add_round_shift_s32(right[9]);
+ right[10] = add_round_shift_s32(right[10]);
+ right[11] = add_round_shift_s32(right[11]);
+ right[12] = add_round_shift_s32(right[12]);
+ right[13] = add_round_shift_s32(right[13]);
+ right[14] = add_round_shift_s32(right[14]);
+ right[15] = add_round_shift_s32(right[15]);
+ right[16] = add_round_shift_s32(right[16]);
+ right[17] = add_round_shift_s32(right[17]);
+ right[18] = add_round_shift_s32(right[18]);
+ right[19] = add_round_shift_s32(right[19]);
+ right[20] = add_round_shift_s32(right[20]);
+ right[21] = add_round_shift_s32(right[21]);
+ right[22] = add_round_shift_s32(right[22]);
+ right[23] = add_round_shift_s32(right[23]);
+ right[24] = add_round_shift_s32(right[24]);
+ right[25] = add_round_shift_s32(right[25]);
+ right[26] = add_round_shift_s32(right[26]);
+ right[27] = add_round_shift_s32(right[27]);
+ right[28] = add_round_shift_s32(right[28]);
+ right[29] = add_round_shift_s32(right[29]);
+ right[30] = add_round_shift_s32(right[30]);
+ right[31] = add_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_partial_sub_round_shift(int32x4_t *left /*[32]*/,
+ int32x4_t *right /* [32] */) {
+ // Also compute partial rounding shift:
+ // output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+
+ left[0] = sub_round_shift_s32(left[0]);
+ left[1] = sub_round_shift_s32(left[1]);
+ left[2] = sub_round_shift_s32(left[2]);
+ left[3] = sub_round_shift_s32(left[3]);
+ left[4] = sub_round_shift_s32(left[4]);
+ left[5] = sub_round_shift_s32(left[5]);
+ left[6] = sub_round_shift_s32(left[6]);
+ left[7] = sub_round_shift_s32(left[7]);
+ left[8] = sub_round_shift_s32(left[8]);
+ left[9] = sub_round_shift_s32(left[9]);
+ left[10] = sub_round_shift_s32(left[10]);
+ left[11] = sub_round_shift_s32(left[11]);
+ left[12] = sub_round_shift_s32(left[12]);
+ left[13] = sub_round_shift_s32(left[13]);
+ left[14] = sub_round_shift_s32(left[14]);
+ left[15] = sub_round_shift_s32(left[15]);
+ left[16] = sub_round_shift_s32(left[16]);
+ left[17] = sub_round_shift_s32(left[17]);
+ left[18] = sub_round_shift_s32(left[18]);
+ left[19] = sub_round_shift_s32(left[19]);
+ left[20] = sub_round_shift_s32(left[20]);
+ left[21] = sub_round_shift_s32(left[21]);
+ left[22] = sub_round_shift_s32(left[22]);
+ left[23] = sub_round_shift_s32(left[23]);
+ left[24] = sub_round_shift_s32(left[24]);
+ left[25] = sub_round_shift_s32(left[25]);
+ left[26] = sub_round_shift_s32(left[26]);
+ left[27] = sub_round_shift_s32(left[27]);
+ left[28] = sub_round_shift_s32(left[28]);
+ left[29] = sub_round_shift_s32(left[29]);
+ left[30] = sub_round_shift_s32(left[30]);
+ left[31] = sub_round_shift_s32(left[31]);
+
+ right[0] = sub_round_shift_s32(right[0]);
+ right[1] = sub_round_shift_s32(right[1]);
+ right[2] = sub_round_shift_s32(right[2]);
+ right[3] = sub_round_shift_s32(right[3]);
+ right[4] = sub_round_shift_s32(right[4]);
+ right[5] = sub_round_shift_s32(right[5]);
+ right[6] = sub_round_shift_s32(right[6]);
+ right[7] = sub_round_shift_s32(right[7]);
+ right[8] = sub_round_shift_s32(right[8]);
+ right[9] = sub_round_shift_s32(right[9]);
+ right[10] = sub_round_shift_s32(right[10]);
+ right[11] = sub_round_shift_s32(right[11]);
+ right[12] = sub_round_shift_s32(right[12]);
+ right[13] = sub_round_shift_s32(right[13]);
+ right[14] = sub_round_shift_s32(right[14]);
+ right[15] = sub_round_shift_s32(right[15]);
+ right[16] = sub_round_shift_s32(right[16]);
+ right[17] = sub_round_shift_s32(right[17]);
+ right[18] = sub_round_shift_s32(right[18]);
+ right[19] = sub_round_shift_s32(right[19]);
+ right[20] = sub_round_shift_s32(right[20]);
+ right[21] = sub_round_shift_s32(right[21]);
+ right[22] = sub_round_shift_s32(right[22]);
+ right[23] = sub_round_shift_s32(right[23]);
+ right[24] = sub_round_shift_s32(right[24]);
+ right[25] = sub_round_shift_s32(right[25]);
+ right[26] = sub_round_shift_s32(right[26]);
+ right[27] = sub_round_shift_s32(right[27]);
+ right[28] = sub_round_shift_s32(right[28]);
+ right[29] = sub_round_shift_s32(right[29]);
+ right[30] = sub_round_shift_s32(right[30]);
+ right[31] = sub_round_shift_s32(right[31]);
+}
+
+static INLINE void highbd_dct8x32_body_first_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // Mini cross: cross the first 16 values and the middle 8 of the second
+ // half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
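+  // Even-indexed outputs come straight from stage 7; each odd-indexed pair
+  // needs one more butterfly, so the stores below are interleaved with those
+  // final butterflies.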
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+  // Mini cross: add/subtract the first 16 values and butterfly the middle 8
+  // of the second half.
+ al[0] = vaddq_s32(left[0], left[15]);
+ ar[0] = vaddq_s32(right[0], right[15]);
+ al[1] = vaddq_s32(left[1], left[14]);
+ ar[1] = vaddq_s32(right[1], right[14]);
+ al[2] = vaddq_s32(left[2], left[13]);
+ ar[2] = vaddq_s32(right[2], right[13]);
+ al[3] = vaddq_s32(left[3], left[12]);
+ ar[3] = vaddq_s32(right[3], right[12]);
+ al[4] = vaddq_s32(left[4], left[11]);
+ ar[4] = vaddq_s32(right[4], right[11]);
+ al[5] = vaddq_s32(left[5], left[10]);
+ ar[5] = vaddq_s32(right[5], right[10]);
+ al[6] = vaddq_s32(left[6], left[9]);
+ ar[6] = vaddq_s32(right[6], right[9]);
+ al[7] = vaddq_s32(left[7], left[8]);
+ ar[7] = vaddq_s32(right[7], right[8]);
+
+ al[8] = vsubq_s32(left[7], left[8]);
+ ar[8] = vsubq_s32(right[7], right[8]);
+ al[9] = vsubq_s32(left[6], left[9]);
+ ar[9] = vsubq_s32(right[6], right[9]);
+ al[10] = vsubq_s32(left[5], left[10]);
+ ar[10] = vsubq_s32(right[5], right[10]);
+ al[11] = vsubq_s32(left[4], left[11]);
+ ar[11] = vsubq_s32(right[4], right[11]);
+ al[12] = vsubq_s32(left[3], left[12]);
+ ar[12] = vsubq_s32(right[3], right[12]);
+ al[13] = vsubq_s32(left[2], left[13]);
+ ar[13] = vsubq_s32(right[2], right[13]);
+ al[14] = vsubq_s32(left[1], left[14]);
+ ar[14] = vsubq_s32(right[1], right[14]);
+ al[15] = vsubq_s32(left[0], left[15]);
+ ar[15] = vsubq_s32(right[0], right[15]);
+
+ al[16] = left[16];
+ ar[16] = right[16];
+ al[17] = left[17];
+ ar[17] = right[17];
+ al[18] = left[18];
+ ar[18] = right[18];
+ al[19] = left[19];
+ ar[19] = right[19];
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
+ al[28] = left[28];
+ ar[28] = right[28];
+ al[29] = left[29];
+ ar[29] = right[29];
+ al[30] = left[30];
+ ar[30] = right[30];
+ al[31] = left[31];
+ ar[31] = right[31];
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(left[16], al[23]);
+ br[16] = vaddq_s32(right[16], ar[23]);
+ bl[17] = vaddq_s32(left[17], al[22]);
+ br[17] = vaddq_s32(right[17], ar[22]);
+ bl[18] = vaddq_s32(left[18], al[21]);
+ br[18] = vaddq_s32(right[18], ar[21]);
+ bl[19] = vaddq_s32(left[19], al[20]);
+ br[19] = vaddq_s32(right[19], ar[20]);
+
+ bl[20] = vsubq_s32(left[19], al[20]);
+ br[20] = vsubq_s32(right[19], ar[20]);
+ bl[21] = vsubq_s32(left[18], al[21]);
+ br[21] = vsubq_s32(right[18], ar[21]);
+ bl[22] = vsubq_s32(left[17], al[22]);
+ br[22] = vsubq_s32(right[17], ar[22]);
+ bl[23] = vsubq_s32(left[16], al[23]);
+ br[23] = vsubq_s32(right[16], ar[23]);
+
+ bl[24] = vsubq_s32(left[31], al[24]);
+ br[24] = vsubq_s32(right[31], ar[24]);
+ bl[25] = vsubq_s32(left[30], al[25]);
+ br[25] = vsubq_s32(right[30], ar[25]);
+ bl[26] = vsubq_s32(left[29], al[26]);
+ br[26] = vsubq_s32(right[29], ar[26]);
+ bl[27] = vsubq_s32(left[28], al[27]);
+ br[27] = vsubq_s32(right[28], ar[27]);
+
+ bl[28] = vaddq_s32(left[28], al[27]);
+ br[28] = vaddq_s32(right[28], ar[27]);
+ bl[29] = vaddq_s32(left[29], al[26]);
+ br[29] = vaddq_s32(right[29], ar[26]);
+ bl[30] = vaddq_s32(left[30], al[25]);
+ br[30] = vaddq_s32(right[30], ar[25]);
+ bl[31] = vaddq_s32(left[31], al[24]);
+ br[31] = vaddq_s32(right[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18],
+ &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19],
+ &ar[19]);
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20],
+ cospi_24_64, -cospi_8_64, &al[27], &ar[27],
+ &al[20], &ar[20]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_24_64, -cospi_8_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32_s64_narrow(al[3], ar[3], al[2], ar[2], cospi_8_64,
+ cospi_24_64, &bl[2], &br[2], &bl[3],
+ &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_8_64,
+ cospi_24_64, &bl[14], &br[14], &bl[9],
+ &br[9]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_24_64, -cospi_8_64, &bl[13], &br[13],
+ &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[7], br[7], bl[4], br[4], cospi_4_64,
+ cospi_28_64, &al[4], &ar[4], &al[7],
+ &ar[7]);
+ butterfly_two_coeff_s32_s64_narrow(bl[6], br[6], bl[5], br[5], cospi_20_64,
+ cospi_12_64, &al[5], &ar[5], &al[6],
+ &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17],
+ &ar[17]);
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18],
+ cospi_28_64, -cospi_4_64, &al[29], &ar[29],
+ &al[18], &ar[18]);
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_20_64, cospi_12_64, &al[26], &ar[26],
+ &al[21], &ar[21]);
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_12_64, -cospi_20_64, &al[25],
+ &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(al[15], ar[15], al[8], ar[8], cospi_2_64,
+ cospi_30_64, &bl[8], &br[8], &bl[15],
+ &br[15]);
+ butterfly_two_coeff_s32_s64_narrow(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14],
+ &br[14]);
+ butterfly_two_coeff_s32_s64_narrow(al[13], ar[13], al[10], ar[10],
+ cospi_10_64, cospi_22_64, &bl[10], &br[10],
+ &bl[13], &br[13]);
+ butterfly_two_coeff_s32_s64_narrow(al[12], ar[12], al[11], ar[11],
+ cospi_26_64, cospi_6_64, &bl[11], &br[11],
+ &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31],
+ &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[30], br[30], bl[17], br[17],
+ cospi_17_64, cospi_15_64, &al[17], &ar[17],
+ &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23],
+ &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[28], br[28], bl[19], br[19],
+ cospi_25_64, cospi_7_64, &al[25], &ar[25],
+ &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27],
+ &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[26], br[26], bl[21], br[21],
+ cospi_21_64, cospi_11_64, &al[21], &ar[21],
+ &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[25], br[25], bl[22], br[22],
+ cospi_13_64, cospi_19_64, &al[13], &ar[13],
+ &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32_s64_narrow(bl[24], br[24], bl[23], br[23],
+ cospi_29_64, cospi_3_64, &al[29], &ar[29],
+ &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+static INLINE void highbd_dct8x32_body_second_pass_rd(int32x4_t *left /*32*/,
+ int32x4_t *right /*32*/) {
+ int32x4_t al[32], ar[32];
+ int32x4_t bl[32], br[32];
+
+ // Stage 1: Done as part of the load.
+
+ // Stage 2.
+ // For the "rd" version, all the values are rounded down after stage 2 to keep
+ // the values in 16 bits.
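+  // add_round_shift_s32() is assumed to match the scalar reference's
+  // half_round_shift(): (a + 1 + (a < 0)) >> 2.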
+ al[0] = add_round_shift_s32(vaddq_s32(left[0], left[15]));
+ ar[0] = add_round_shift_s32(vaddq_s32(right[0], right[15]));
+ al[1] = add_round_shift_s32(vaddq_s32(left[1], left[14]));
+ ar[1] = add_round_shift_s32(vaddq_s32(right[1], right[14]));
+ al[2] = add_round_shift_s32(vaddq_s32(left[2], left[13]));
+ ar[2] = add_round_shift_s32(vaddq_s32(right[2], right[13]));
+ al[3] = add_round_shift_s32(vaddq_s32(left[3], left[12]));
+ ar[3] = add_round_shift_s32(vaddq_s32(right[3], right[12]));
+ al[4] = add_round_shift_s32(vaddq_s32(left[4], left[11]));
+ ar[4] = add_round_shift_s32(vaddq_s32(right[4], right[11]));
+ al[5] = add_round_shift_s32(vaddq_s32(left[5], left[10]));
+ ar[5] = add_round_shift_s32(vaddq_s32(right[5], right[10]));
+ al[6] = add_round_shift_s32(vaddq_s32(left[6], left[9]));
+ ar[6] = add_round_shift_s32(vaddq_s32(right[6], right[9]));
+ al[7] = add_round_shift_s32(vaddq_s32(left[7], left[8]));
+ ar[7] = add_round_shift_s32(vaddq_s32(right[7], right[8]));
+
+ al[8] = add_round_shift_s32(vsubq_s32(left[7], left[8]));
+ ar[8] = add_round_shift_s32(vsubq_s32(right[7], right[8]));
+ al[9] = add_round_shift_s32(vsubq_s32(left[6], left[9]));
+ ar[9] = add_round_shift_s32(vsubq_s32(right[6], right[9]));
+ al[10] = add_round_shift_s32(vsubq_s32(left[5], left[10]));
+ ar[10] = add_round_shift_s32(vsubq_s32(right[5], right[10]));
+ al[11] = add_round_shift_s32(vsubq_s32(left[4], left[11]));
+ ar[11] = add_round_shift_s32(vsubq_s32(right[4], right[11]));
+ al[12] = add_round_shift_s32(vsubq_s32(left[3], left[12]));
+ ar[12] = add_round_shift_s32(vsubq_s32(right[3], right[12]));
+ al[13] = add_round_shift_s32(vsubq_s32(left[2], left[13]));
+ ar[13] = add_round_shift_s32(vsubq_s32(right[2], right[13]));
+ al[14] = add_round_shift_s32(vsubq_s32(left[1], left[14]));
+ ar[14] = add_round_shift_s32(vsubq_s32(right[1], right[14]));
+ al[15] = add_round_shift_s32(vsubq_s32(left[0], left[15]));
+ ar[15] = add_round_shift_s32(vsubq_s32(right[0], right[15]));
+
+ al[16] = add_round_shift_s32(left[16]);
+ ar[16] = add_round_shift_s32(right[16]);
+ al[17] = add_round_shift_s32(left[17]);
+ ar[17] = add_round_shift_s32(right[17]);
+ al[18] = add_round_shift_s32(left[18]);
+ ar[18] = add_round_shift_s32(right[18]);
+ al[19] = add_round_shift_s32(left[19]);
+ ar[19] = add_round_shift_s32(right[19]);
+
+ butterfly_one_coeff_s32_fast(left[27], right[27], left[20], right[20],
+ cospi_16_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_one_coeff_s32_fast(left[26], right[26], left[21], right[21],
+ cospi_16_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_one_coeff_s32_fast(left[25], right[25], left[22], right[22],
+ cospi_16_64, &al[25], &ar[25], &al[22], &ar[22]);
+ butterfly_one_coeff_s32_fast(left[24], right[24], left[23], right[23],
+ cospi_16_64, &al[24], &ar[24], &al[23], &ar[23]);
+
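+  // The four butterflies above consumed unscaled inputs, so their outputs
+  // are scaled down here to match the rest of the stage-2 values.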
+ al[20] = add_round_shift_s32(al[20]);
+ ar[20] = add_round_shift_s32(ar[20]);
+ al[21] = add_round_shift_s32(al[21]);
+ ar[21] = add_round_shift_s32(ar[21]);
+ al[22] = add_round_shift_s32(al[22]);
+ ar[22] = add_round_shift_s32(ar[22]);
+ al[23] = add_round_shift_s32(al[23]);
+ ar[23] = add_round_shift_s32(ar[23]);
+ al[24] = add_round_shift_s32(al[24]);
+ ar[24] = add_round_shift_s32(ar[24]);
+ al[25] = add_round_shift_s32(al[25]);
+ ar[25] = add_round_shift_s32(ar[25]);
+ al[26] = add_round_shift_s32(al[26]);
+ ar[26] = add_round_shift_s32(ar[26]);
+ al[27] = add_round_shift_s32(al[27]);
+ ar[27] = add_round_shift_s32(ar[27]);
+
+ al[28] = add_round_shift_s32(left[28]);
+ ar[28] = add_round_shift_s32(right[28]);
+ al[29] = add_round_shift_s32(left[29]);
+ ar[29] = add_round_shift_s32(right[29]);
+ al[30] = add_round_shift_s32(left[30]);
+ ar[30] = add_round_shift_s32(right[30]);
+ al[31] = add_round_shift_s32(left[31]);
+ ar[31] = add_round_shift_s32(right[31]);
+
+ // Stage 3.
+ bl[0] = vaddq_s32(al[0], al[7]);
+ br[0] = vaddq_s32(ar[0], ar[7]);
+ bl[1] = vaddq_s32(al[1], al[6]);
+ br[1] = vaddq_s32(ar[1], ar[6]);
+ bl[2] = vaddq_s32(al[2], al[5]);
+ br[2] = vaddq_s32(ar[2], ar[5]);
+ bl[3] = vaddq_s32(al[3], al[4]);
+ br[3] = vaddq_s32(ar[3], ar[4]);
+
+ bl[4] = vsubq_s32(al[3], al[4]);
+ br[4] = vsubq_s32(ar[3], ar[4]);
+ bl[5] = vsubq_s32(al[2], al[5]);
+ br[5] = vsubq_s32(ar[2], ar[5]);
+ bl[6] = vsubq_s32(al[1], al[6]);
+ br[6] = vsubq_s32(ar[1], ar[6]);
+ bl[7] = vsubq_s32(al[0], al[7]);
+ br[7] = vsubq_s32(ar[0], ar[7]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+ bl[9] = al[9];
+ br[9] = ar[9];
+
+ butterfly_one_coeff_s32_fast(al[13], ar[13], al[10], ar[10], cospi_16_64,
+ &bl[13], &br[13], &bl[10], &br[10]);
+ butterfly_one_coeff_s32_fast(al[12], ar[12], al[11], ar[11], cospi_16_64,
+ &bl[12], &br[12], &bl[11], &br[11]);
+
+ bl[14] = al[14];
+ br[14] = ar[14];
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[16], al[23]);
+ br[16] = vaddq_s32(ar[16], ar[23]);
+ bl[17] = vaddq_s32(al[17], al[22]);
+ br[17] = vaddq_s32(ar[17], ar[22]);
+ bl[18] = vaddq_s32(al[18], al[21]);
+ br[18] = vaddq_s32(ar[18], ar[21]);
+ bl[19] = vaddq_s32(al[19], al[20]);
+ br[19] = vaddq_s32(ar[19], ar[20]);
+
+ bl[20] = vsubq_s32(al[19], al[20]);
+ br[20] = vsubq_s32(ar[19], ar[20]);
+ bl[21] = vsubq_s32(al[18], al[21]);
+ br[21] = vsubq_s32(ar[18], ar[21]);
+ bl[22] = vsubq_s32(al[17], al[22]);
+ br[22] = vsubq_s32(ar[17], ar[22]);
+ bl[23] = vsubq_s32(al[16], al[23]);
+ br[23] = vsubq_s32(ar[16], ar[23]);
+
+ bl[24] = vsubq_s32(al[31], al[24]);
+ br[24] = vsubq_s32(ar[31], ar[24]);
+ bl[25] = vsubq_s32(al[30], al[25]);
+ br[25] = vsubq_s32(ar[30], ar[25]);
+ bl[26] = vsubq_s32(al[29], al[26]);
+ br[26] = vsubq_s32(ar[29], ar[26]);
+ bl[27] = vsubq_s32(al[28], al[27]);
+ br[27] = vsubq_s32(ar[28], ar[27]);
+
+ bl[28] = vaddq_s32(al[28], al[27]);
+ br[28] = vaddq_s32(ar[28], ar[27]);
+ bl[29] = vaddq_s32(al[29], al[26]);
+ br[29] = vaddq_s32(ar[29], ar[26]);
+ bl[30] = vaddq_s32(al[30], al[25]);
+ br[30] = vaddq_s32(ar[30], ar[25]);
+ bl[31] = vaddq_s32(al[31], al[24]);
+ br[31] = vaddq_s32(ar[31], ar[24]);
+
+ // Stage 4.
+ al[0] = vaddq_s32(bl[0], bl[3]);
+ ar[0] = vaddq_s32(br[0], br[3]);
+ al[1] = vaddq_s32(bl[1], bl[2]);
+ ar[1] = vaddq_s32(br[1], br[2]);
+ al[2] = vsubq_s32(bl[1], bl[2]);
+ ar[2] = vsubq_s32(br[1], br[2]);
+ al[3] = vsubq_s32(bl[0], bl[3]);
+ ar[3] = vsubq_s32(br[0], br[3]);
+
+ al[4] = bl[4];
+ ar[4] = br[4];
+
+ butterfly_one_coeff_s32_fast(bl[6], br[6], bl[5], br[5], cospi_16_64, &al[6],
+ &ar[6], &al[5], &ar[5]);
+
+ al[7] = bl[7];
+ ar[7] = br[7];
+
+ al[8] = vaddq_s32(bl[8], bl[11]);
+ ar[8] = vaddq_s32(br[8], br[11]);
+ al[9] = vaddq_s32(bl[9], bl[10]);
+ ar[9] = vaddq_s32(br[9], br[10]);
+ al[10] = vsubq_s32(bl[9], bl[10]);
+ ar[10] = vsubq_s32(br[9], br[10]);
+ al[11] = vsubq_s32(bl[8], bl[11]);
+ ar[11] = vsubq_s32(br[8], br[11]);
+ al[12] = vsubq_s32(bl[15], bl[12]);
+ ar[12] = vsubq_s32(br[15], br[12]);
+ al[13] = vsubq_s32(bl[14], bl[13]);
+ ar[13] = vsubq_s32(br[14], br[13]);
+ al[14] = vaddq_s32(bl[14], bl[13]);
+ ar[14] = vaddq_s32(br[14], br[13]);
+ al[15] = vaddq_s32(bl[15], bl[12]);
+ ar[15] = vaddq_s32(br[15], br[12]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[17] = bl[17];
+ ar[17] = br[17];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_8_64,
+ cospi_24_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_8_64,
+ cospi_24_64, &al[28], &ar[28], &al[19], &ar[19]);
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_24_64,
+ -cospi_8_64, &al[27], &ar[27], &al[20], &ar[20]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_24_64,
+ -cospi_8_64, &al[26], &ar[26], &al[21], &ar[21]);
+
+ al[22] = bl[22];
+ ar[22] = br[22];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[25] = bl[25];
+ ar[25] = br[25];
+
+ al[30] = bl[30];
+ ar[30] = br[30];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ // Stage 5.
+ butterfly_one_coeff_s32_fast(al[0], ar[0], al[1], ar[1], cospi_16_64, &bl[0],
+ &br[0], &bl[1], &br[1]);
+ butterfly_two_coeff_s32(al[3], ar[3], al[2], ar[2], cospi_8_64, cospi_24_64,
+ &bl[2], &br[2], &bl[3], &br[3]);
+
+ bl[4] = vaddq_s32(al[4], al[5]);
+ br[4] = vaddq_s32(ar[4], ar[5]);
+ bl[5] = vsubq_s32(al[4], al[5]);
+ br[5] = vsubq_s32(ar[4], ar[5]);
+ bl[6] = vsubq_s32(al[7], al[6]);
+ br[6] = vsubq_s32(ar[7], ar[6]);
+ bl[7] = vaddq_s32(al[7], al[6]);
+ br[7] = vaddq_s32(ar[7], ar[6]);
+
+ bl[8] = al[8];
+ br[8] = ar[8];
+
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_8_64, cospi_24_64,
+ &bl[14], &br[14], &bl[9], &br[9]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_24_64,
+ -cospi_8_64, &bl[13], &br[13], &bl[10], &br[10]);
+
+ bl[11] = al[11];
+ br[11] = ar[11];
+ bl[12] = al[12];
+ br[12] = ar[12];
+
+ bl[15] = al[15];
+ br[15] = ar[15];
+
+ bl[16] = vaddq_s32(al[19], al[16]);
+ br[16] = vaddq_s32(ar[19], ar[16]);
+ bl[17] = vaddq_s32(al[18], al[17]);
+ br[17] = vaddq_s32(ar[18], ar[17]);
+ bl[18] = vsubq_s32(al[17], al[18]);
+ br[18] = vsubq_s32(ar[17], ar[18]);
+ bl[19] = vsubq_s32(al[16], al[19]);
+ br[19] = vsubq_s32(ar[16], ar[19]);
+ bl[20] = vsubq_s32(al[23], al[20]);
+ br[20] = vsubq_s32(ar[23], ar[20]);
+ bl[21] = vsubq_s32(al[22], al[21]);
+ br[21] = vsubq_s32(ar[22], ar[21]);
+ bl[22] = vaddq_s32(al[21], al[22]);
+ br[22] = vaddq_s32(ar[21], ar[22]);
+ bl[23] = vaddq_s32(al[20], al[23]);
+ br[23] = vaddq_s32(ar[20], ar[23]);
+ bl[24] = vaddq_s32(al[27], al[24]);
+ br[24] = vaddq_s32(ar[27], ar[24]);
+ bl[25] = vaddq_s32(al[26], al[25]);
+ br[25] = vaddq_s32(ar[26], ar[25]);
+ bl[26] = vsubq_s32(al[25], al[26]);
+ br[26] = vsubq_s32(ar[25], ar[26]);
+ bl[27] = vsubq_s32(al[24], al[27]);
+ br[27] = vsubq_s32(ar[24], ar[27]);
+ bl[28] = vsubq_s32(al[31], al[28]);
+ br[28] = vsubq_s32(ar[31], ar[28]);
+ bl[29] = vsubq_s32(al[30], al[29]);
+ br[29] = vsubq_s32(ar[30], ar[29]);
+ bl[30] = vaddq_s32(al[29], al[30]);
+ br[30] = vaddq_s32(ar[29], ar[30]);
+ bl[31] = vaddq_s32(al[28], al[31]);
+ br[31] = vaddq_s32(ar[28], ar[31]);
+
+ // Stage 6.
+ al[0] = bl[0];
+ ar[0] = br[0];
+ al[1] = bl[1];
+ ar[1] = br[1];
+ al[2] = bl[2];
+ ar[2] = br[2];
+ al[3] = bl[3];
+ ar[3] = br[3];
+
+ butterfly_two_coeff_s32(bl[7], br[7], bl[4], br[4], cospi_4_64, cospi_28_64,
+ &al[4], &ar[4], &al[7], &ar[7]);
+ butterfly_two_coeff_s32(bl[6], br[6], bl[5], br[5], cospi_20_64, cospi_12_64,
+ &al[5], &ar[5], &al[6], &ar[6]);
+
+ al[8] = vaddq_s32(bl[8], bl[9]);
+ ar[8] = vaddq_s32(br[8], br[9]);
+ al[9] = vsubq_s32(bl[8], bl[9]);
+ ar[9] = vsubq_s32(br[8], br[9]);
+ al[10] = vsubq_s32(bl[11], bl[10]);
+ ar[10] = vsubq_s32(br[11], br[10]);
+ al[11] = vaddq_s32(bl[11], bl[10]);
+ ar[11] = vaddq_s32(br[11], br[10]);
+ al[12] = vaddq_s32(bl[12], bl[13]);
+ ar[12] = vaddq_s32(br[12], br[13]);
+ al[13] = vsubq_s32(bl[12], bl[13]);
+ ar[13] = vsubq_s32(br[12], br[13]);
+ al[14] = vsubq_s32(bl[15], bl[14]);
+ ar[14] = vsubq_s32(br[15], br[14]);
+ al[15] = vaddq_s32(bl[15], bl[14]);
+ ar[15] = vaddq_s32(br[15], br[14]);
+
+ al[16] = bl[16];
+ ar[16] = br[16];
+ al[19] = bl[19];
+ ar[19] = br[19];
+ al[20] = bl[20];
+ ar[20] = br[20];
+ al[23] = bl[23];
+ ar[23] = br[23];
+ al[24] = bl[24];
+ ar[24] = br[24];
+ al[27] = bl[27];
+ ar[27] = br[27];
+ al[28] = bl[28];
+ ar[28] = br[28];
+ al[31] = bl[31];
+ ar[31] = br[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_4_64,
+ cospi_28_64, &al[30], &ar[30], &al[17], &ar[17]);
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_28_64,
+ -cospi_4_64, &al[29], &ar[29], &al[18], &ar[18]);
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_20_64,
+ cospi_12_64, &al[26], &ar[26], &al[21], &ar[21]);
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_12_64,
+ -cospi_20_64, &al[25], &ar[25], &al[22], &ar[22]);
+
+ // Stage 7.
+ bl[0] = al[0];
+ br[0] = ar[0];
+ bl[1] = al[1];
+ br[1] = ar[1];
+ bl[2] = al[2];
+ br[2] = ar[2];
+ bl[3] = al[3];
+ br[3] = ar[3];
+ bl[4] = al[4];
+ br[4] = ar[4];
+ bl[5] = al[5];
+ br[5] = ar[5];
+ bl[6] = al[6];
+ br[6] = ar[6];
+ bl[7] = al[7];
+ br[7] = ar[7];
+
+ butterfly_two_coeff_s32(al[15], ar[15], al[8], ar[8], cospi_2_64, cospi_30_64,
+ &bl[8], &br[8], &bl[15], &br[15]);
+ butterfly_two_coeff_s32(al[14], ar[14], al[9], ar[9], cospi_18_64,
+ cospi_14_64, &bl[9], &br[9], &bl[14], &br[14]);
+ butterfly_two_coeff_s32(al[13], ar[13], al[10], ar[10], cospi_10_64,
+ cospi_22_64, &bl[10], &br[10], &bl[13], &br[13]);
+ butterfly_two_coeff_s32(al[12], ar[12], al[11], ar[11], cospi_26_64,
+ cospi_6_64, &bl[11], &br[11], &bl[12], &br[12]);
+
+ bl[16] = vaddq_s32(al[16], al[17]);
+ br[16] = vaddq_s32(ar[16], ar[17]);
+ bl[17] = vsubq_s32(al[16], al[17]);
+ br[17] = vsubq_s32(ar[16], ar[17]);
+ bl[18] = vsubq_s32(al[19], al[18]);
+ br[18] = vsubq_s32(ar[19], ar[18]);
+ bl[19] = vaddq_s32(al[19], al[18]);
+ br[19] = vaddq_s32(ar[19], ar[18]);
+ bl[20] = vaddq_s32(al[20], al[21]);
+ br[20] = vaddq_s32(ar[20], ar[21]);
+ bl[21] = vsubq_s32(al[20], al[21]);
+ br[21] = vsubq_s32(ar[20], ar[21]);
+ bl[22] = vsubq_s32(al[23], al[22]);
+ br[22] = vsubq_s32(ar[23], ar[22]);
+ bl[23] = vaddq_s32(al[23], al[22]);
+ br[23] = vaddq_s32(ar[23], ar[22]);
+ bl[24] = vaddq_s32(al[24], al[25]);
+ br[24] = vaddq_s32(ar[24], ar[25]);
+ bl[25] = vsubq_s32(al[24], al[25]);
+ br[25] = vsubq_s32(ar[24], ar[25]);
+ bl[26] = vsubq_s32(al[27], al[26]);
+ br[26] = vsubq_s32(ar[27], ar[26]);
+ bl[27] = vaddq_s32(al[27], al[26]);
+ br[27] = vaddq_s32(ar[27], ar[26]);
+ bl[28] = vaddq_s32(al[28], al[29]);
+ br[28] = vaddq_s32(ar[28], ar[29]);
+ bl[29] = vsubq_s32(al[28], al[29]);
+ br[29] = vsubq_s32(ar[28], ar[29]);
+ bl[30] = vsubq_s32(al[31], al[30]);
+ br[30] = vsubq_s32(ar[31], ar[30]);
+ bl[31] = vaddq_s32(al[31], al[30]);
+ br[31] = vaddq_s32(ar[31], ar[30]);
+
+ // Final stage.
+ left[0] = bl[0];
+ right[0] = br[0];
+ left[16] = bl[1];
+ right[16] = br[1];
+ left[8] = bl[2];
+ right[8] = br[2];
+ left[24] = bl[3];
+ right[24] = br[3];
+ left[4] = bl[4];
+ right[4] = br[4];
+ left[20] = bl[5];
+ right[20] = br[5];
+ left[12] = bl[6];
+ right[12] = br[6];
+ left[28] = bl[7];
+ right[28] = br[7];
+ left[2] = bl[8];
+ right[2] = br[8];
+ left[18] = bl[9];
+ right[18] = br[9];
+ left[10] = bl[10];
+ right[10] = br[10];
+ left[26] = bl[11];
+ right[26] = br[11];
+ left[6] = bl[12];
+ right[6] = br[12];
+ left[22] = bl[13];
+ right[22] = br[13];
+ left[14] = bl[14];
+ right[14] = br[14];
+ left[30] = bl[15];
+ right[30] = br[15];
+
+ butterfly_two_coeff_s32(bl[31], br[31], bl[16], br[16], cospi_1_64,
+ cospi_31_64, &al[1], &ar[1], &al[31], &ar[31]);
+ left[1] = al[1];
+ right[1] = ar[1];
+ left[31] = al[31];
+ right[31] = ar[31];
+
+ butterfly_two_coeff_s32(bl[30], br[30], bl[17], br[17], cospi_17_64,
+ cospi_15_64, &al[17], &ar[17], &al[15], &ar[15]);
+ left[17] = al[17];
+ right[17] = ar[17];
+ left[15] = al[15];
+ right[15] = ar[15];
+
+ butterfly_two_coeff_s32(bl[29], br[29], bl[18], br[18], cospi_9_64,
+ cospi_23_64, &al[9], &ar[9], &al[23], &ar[23]);
+ left[9] = al[9];
+ right[9] = ar[9];
+ left[23] = al[23];
+ right[23] = ar[23];
+
+ butterfly_two_coeff_s32(bl[28], br[28], bl[19], br[19], cospi_25_64,
+ cospi_7_64, &al[25], &ar[25], &al[7], &ar[7]);
+ left[25] = al[25];
+ right[25] = ar[25];
+ left[7] = al[7];
+ right[7] = ar[7];
+
+ butterfly_two_coeff_s32(bl[27], br[27], bl[20], br[20], cospi_5_64,
+ cospi_27_64, &al[5], &ar[5], &al[27], &ar[27]);
+ left[5] = al[5];
+ right[5] = ar[5];
+ left[27] = al[27];
+ right[27] = ar[27];
+
+ butterfly_two_coeff_s32(bl[26], br[26], bl[21], br[21], cospi_21_64,
+ cospi_11_64, &al[21], &ar[21], &al[11], &ar[11]);
+ left[21] = al[21];
+ right[21] = ar[21];
+ left[11] = al[11];
+ right[11] = ar[11];
+
+ butterfly_two_coeff_s32(bl[25], br[25], bl[22], br[22], cospi_13_64,
+ cospi_19_64, &al[13], &ar[13], &al[19], &ar[19]);
+ left[13] = al[13];
+ right[13] = ar[13];
+ left[19] = al[19];
+ right[19] = ar[19];
+
+ butterfly_two_coeff_s32(bl[24], br[24], bl[23], br[23], cospi_29_64,
+ cospi_3_64, &al[29], &ar[29], &al[3], &ar[3]);
+ left[29] = al[29];
+ right[29] = ar[29];
+ left[3] = al[3];
+ right[3] = ar[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_VPX_DSP_ARM_FDCT32X32_NEON_H_
diff --git a/libvpx/vpx_dsp/arm/fdct_neon.c b/libvpx/vpx_dsp/arm/fdct4x4_neon.c
index 2827791f1..3b9196fae 100644
--- a/libvpx/vpx_dsp/arm/fdct_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct4x4_neon.c
@@ -18,10 +18,10 @@
#include "vpx_dsp/arm/fdct_neon.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct4x4_neon.h"
void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
int stride) {
- int i;
// input[M * stride] * 16
int16x4_t in[4];
in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
@@ -34,9 +34,8 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
in[0] = vadd_s16(in[0], one);
}
- for (i = 0; i < 2; ++i) {
- vpx_fdct4x4_pass1_neon(in);
- }
+ vpx_fdct4x4_pass1_neon(in);
+ vpx_fdct4x4_pass2_neon(in);
{
// Not quite a rounding shift. Only add 1 despite shifting by 2.
const int16x8_t one = vdupq_n_s16(1);
@@ -48,3 +47,39 @@ void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
store_s16q_to_tran_low(final_output + 1 * 8, out_23);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ static const int32x4_t const_1000 = { 1, 0, 0, 0 };
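+  // Adds 1 to lane 0 only; this is assumed to mirror the scalar reference's
+  // conditional "+= 1" on the first input value.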
+ const int32x4_t const_one = vdupq_n_s32(1);
+
+ // input[M * stride] * 16
+ int32x4_t in[4];
+ in[0] = vshll_n_s16(vld1_s16(input + 0 * stride), 4);
+ in[1] = vshll_n_s16(vld1_s16(input + 1 * stride), 4);
+ in[2] = vshll_n_s16(vld1_s16(input + 2 * stride), 4);
+ in[3] = vshll_n_s16(vld1_s16(input + 3 * stride), 4);
+
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ in[0] = vaddq_s32(in[0], const_1000);
+ }
+
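+  // Both passes use the same kernel: there is no separate highbd pass-2
+  // variant, presumably because the 32-bit lanes keep enough headroom that
+  // pass 2 does not need the widening butterflies the 16-bit path uses.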
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ vpx_highbd_fdct4x4_pass1_neon(in);
+ {
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ in[0] = vshrq_n_s32(vaddq_s32(in[0], const_one), 2);
+ in[1] = vshrq_n_s32(vaddq_s32(in[1], const_one), 2);
+ in[2] = vshrq_n_s32(vaddq_s32(in[2], const_one), 2);
+ in[3] = vshrq_n_s32(vaddq_s32(in[3], const_one), 2);
+
+ vst1q_s32(final_output, in[0]);
+ vst1q_s32(final_output + 4, in[1]);
+ vst1q_s32(final_output + 8, in[2]);
+ vst1q_s32(final_output + 12, in[3]);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/arm/fdct4x4_neon.h b/libvpx/vpx_dsp/arm/fdct4x4_neon.h
new file mode 100644
index 000000000..de3db9774
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct4x4_neon.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+ butterfly_one_coeff_s16_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
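+// Pass 2 is identical to pass 1 except that the cospi_16_64 butterfly widens
+// to 32 bits before narrowing back (the _s32_fast_narrow variant), since the
+// larger second-pass inputs could otherwise overflow the 16-bit fast path.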
+static INLINE void vpx_fdct4x4_pass2_neon(int16x4_t *in) {
+ int16x4_t out[4];
+
+ const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+ const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // fdct_round_shift(s_0 +/- s_1) * cospi_16_64
+ butterfly_one_coeff_s16_s32_fast_narrow_half(s_0, s_1, cospi_16_64, &out[0],
+ &out[2]);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_half(s_3, s_2, cospi_8_64, cospi_24_64, &out[1], &out[3]);
+
+ transpose_s16_4x4d(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE void vpx_highbd_fdct4x4_pass1_neon(int32x4_t *in) {
+ int32x4_t out[4];
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int32x4_t s_0 = vaddq_s32(in[0], in[3]);
+ const int32x4_t s_1 = vaddq_s32(in[1], in[2]);
+ const int32x4_t s_2 = vsubq_s32(in[1], in[2]);
+ const int32x4_t s_3 = vsubq_s32(in[0], in[3]);
+
+ butterfly_one_coeff_s32_fast_half(s_0, s_1, cospi_16_64, &out[0], &out[2]);
+
+ // out[1] = s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // out[3] = s_3 * cospi_24_64 - s_2 * cospi_8_64
+ butterfly_two_coeff_s32_s64_narrow_half(s_3, s_2, cospi_8_64, cospi_24_64,
+ &out[1], &out[3]);
+
+ transpose_s32_4x4(&out[0], &out[1], &out[2], &out[3]);
+
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT4X4_NEON_H_
diff --git a/libvpx/vpx_dsp/arm/fdct8x8_neon.c b/libvpx/vpx_dsp/arm/fdct8x8_neon.c
new file mode 100644
index 000000000..75ee6f223
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct8x8_neon.c
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/fdct8x8_neon.h"
+
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+  // Stage 1: load the input and scale it by 4.
+ int16x8_t in[8];
+ in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+
+ vpx_fdct8x8_pass1_neon(in);
+ vpx_fdct8x8_pass2_neon(in);
+ {
+ // from vpx_dct_sse2.c
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
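+    // vhsubq_s16() computes (a - b) >> 1 in one step. Worked example, n = -5:
+    // n >> 15 = -1, so (n - (n >> 15)) >> 1 = (-4) >> 1 = -2, truncating
+    // toward zero where a plain arithmetic shift would floor to -3.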
+ const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+ in[0] = vhsubq_s16(in[0], sign_in0);
+ in[1] = vhsubq_s16(in[1], sign_in1);
+ in[2] = vhsubq_s16(in[2], sign_in2);
+ in[3] = vhsubq_s16(in[3], sign_in3);
+ in[4] = vhsubq_s16(in[4], sign_in4);
+ in[5] = vhsubq_s16(in[5], sign_in5);
+ in[6] = vhsubq_s16(in[6], sign_in6);
+ in[7] = vhsubq_s16(in[7], sign_in7);
+ // store results
+ store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+ store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+ store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+ store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+ store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+ store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+ store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+ store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+  // input[M * stride] * 4
+ int32x4_t left[8], right[8];
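+  // left[] holds columns 0-3 of each row and right[] holds columns 4-7.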
+ int16x8_t in[8];
+ in[0] = vld1q_s16(input + 0 * stride);
+ in[1] = vld1q_s16(input + 1 * stride);
+ in[2] = vld1q_s16(input + 2 * stride);
+ in[3] = vld1q_s16(input + 3 * stride);
+ in[4] = vld1q_s16(input + 4 * stride);
+ in[5] = vld1q_s16(input + 5 * stride);
+ in[6] = vld1q_s16(input + 6 * stride);
+ in[7] = vld1q_s16(input + 7 * stride);
+
+ left[0] = vshll_n_s16(vget_low_s16(in[0]), 2);
+ left[1] = vshll_n_s16(vget_low_s16(in[1]), 2);
+ left[2] = vshll_n_s16(vget_low_s16(in[2]), 2);
+ left[3] = vshll_n_s16(vget_low_s16(in[3]), 2);
+ left[4] = vshll_n_s16(vget_low_s16(in[4]), 2);
+ left[5] = vshll_n_s16(vget_low_s16(in[5]), 2);
+ left[6] = vshll_n_s16(vget_low_s16(in[6]), 2);
+ left[7] = vshll_n_s16(vget_low_s16(in[7]), 2);
+ right[0] = vshll_n_s16(vget_high_s16(in[0]), 2);
+ right[1] = vshll_n_s16(vget_high_s16(in[1]), 2);
+ right[2] = vshll_n_s16(vget_high_s16(in[2]), 2);
+ right[3] = vshll_n_s16(vget_high_s16(in[3]), 2);
+ right[4] = vshll_n_s16(vget_high_s16(in[4]), 2);
+ right[5] = vshll_n_s16(vget_high_s16(in[5]), 2);
+ right[6] = vshll_n_s16(vget_high_s16(in[6]), 2);
+ right[7] = vshll_n_s16(vget_high_s16(in[7]), 2);
+
+ vpx_highbd_fdct8x8_pass1_neon(left, right);
+ vpx_highbd_fdct8x8_pass2_neon(left, right);
+ {
+ left[0] = add_round_shift_half_s32(left[0]);
+ left[1] = add_round_shift_half_s32(left[1]);
+ left[2] = add_round_shift_half_s32(left[2]);
+ left[3] = add_round_shift_half_s32(left[3]);
+ left[4] = add_round_shift_half_s32(left[4]);
+ left[5] = add_round_shift_half_s32(left[5]);
+ left[6] = add_round_shift_half_s32(left[6]);
+ left[7] = add_round_shift_half_s32(left[7]);
+ right[0] = add_round_shift_half_s32(right[0]);
+ right[1] = add_round_shift_half_s32(right[1]);
+ right[2] = add_round_shift_half_s32(right[2]);
+ right[3] = add_round_shift_half_s32(right[3]);
+ right[4] = add_round_shift_half_s32(right[4]);
+ right[5] = add_round_shift_half_s32(right[5]);
+ right[6] = add_round_shift_half_s32(right[6]);
+ right[7] = add_round_shift_half_s32(right[7]);
+
+ // store results
+ vst1q_s32(final_output, left[0]);
+ vst1q_s32(final_output + 4, right[0]);
+ vst1q_s32(final_output + 8, left[1]);
+ vst1q_s32(final_output + 12, right[1]);
+ vst1q_s32(final_output + 16, left[2]);
+ vst1q_s32(final_output + 20, right[2]);
+ vst1q_s32(final_output + 24, left[3]);
+ vst1q_s32(final_output + 28, right[3]);
+ vst1q_s32(final_output + 32, left[4]);
+ vst1q_s32(final_output + 36, right[4]);
+ vst1q_s32(final_output + 40, left[5]);
+ vst1q_s32(final_output + 44, right[5]);
+ vst1q_s32(final_output + 48, left[6]);
+ vst1q_s32(final_output + 52, right[6]);
+ vst1q_s32(final_output + 56, left[7]);
+ vst1q_s32(final_output + 60, right[7]);
+ }
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/arm/fdct8x8_neon.h b/libvpx/vpx_dsp/arm/fdct8x8_neon.h
new file mode 100644
index 000000000..d8fa60044
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct8x8_neon.h
@@ -0,0 +1,381 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_fast(x[0], x[1], cospi_16_64, &out[0], &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_fast(s[6], s[5], cospi_16_64, &t[1], &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass2_notranspose_neon(int16x8_t *in,
+ int16x8_t *out) {
+ int16x8_t s[8], x[4], t[2];
+
+ s[0] = vaddq_s16(in[0], in[7]);
+ s[1] = vaddq_s16(in[1], in[6]);
+ s[2] = vaddq_s16(in[2], in[5]);
+ s[3] = vaddq_s16(in[3], in[4]);
+ s[4] = vsubq_s16(in[3], in[4]);
+ s[5] = vsubq_s16(in[2], in[5]);
+ s[6] = vsubq_s16(in[1], in[6]);
+ s[7] = vsubq_s16(in[0], in[7]);
+ // fdct4(step, step);
+ x[0] = vaddq_s16(s[0], s[3]);
+ x[1] = vaddq_s16(s[1], s[2]);
+ x[2] = vsubq_s16(s[1], s[2]);
+ x[3] = vsubq_s16(s[0], s[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s16_s32_fast_narrow(x[0], x[1], cospi_16_64, &out[0],
+ &out[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff(x[3], x[2], cospi_8_64, cospi_24_64, &out[2], &out[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s16_s32_fast_narrow(s[6], s[5], cospi_16_64, &t[1],
+ &t[0]);
+
+ // Stage 3
+ x[0] = vaddq_s16(s[4], t[0]);
+ x[1] = vsubq_s16(s[4], t[0]);
+ x[2] = vsubq_s16(s[7], t[1]);
+ x[3] = vaddq_s16(s[7], t[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff(x[3], x[0], cospi_4_64, cospi_28_64, &out[1], &out[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff(x[2], x[1], cospi_20_64, cospi_12_64, &out[5], &out[3]);
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass1_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+static INLINE void vpx_fdct8x8_pass2_neon(int16x8_t *in) {
+ int16x8_t out[8];
+ vpx_fdct8x8_pass2_notranspose_neon(in, out);
+ // transpose 8x8
+ transpose_s16_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+ in[0] = out[0];
+ in[1] = out[1];
+ in[2] = out[2];
+ in[3] = out[3];
+ in[4] = out[4];
+ in[5] = out[5];
+ in[6] = out[6];
+ in[7] = out[7];
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void vpx_highbd_fdct8x8_pass1_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[2], xr[2], cospi_8_64, cospi_24_64,
+ &left[2], &right[2], &left[6], &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32(xl[3], xr[3], xl[0], xr[0], cospi_4_64, cospi_28_64,
+ &left[1], &right[1], &left[7], &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32(xl[2], xr[2], xl[1], xr[1], cospi_20_64, cospi_12_64,
+ &left[5], &right[5], &left[3], &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_notranspose_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4_t sl[8], sr[8], xl[4], xr[4], tl[4], tr[4];
+
+ sl[0] = vaddq_s32(left[0], left[7]);
+ sl[1] = vaddq_s32(left[1], left[6]);
+ sl[2] = vaddq_s32(left[2], left[5]);
+ sl[3] = vaddq_s32(left[3], left[4]);
+ sl[4] = vsubq_s32(left[3], left[4]);
+ sl[5] = vsubq_s32(left[2], left[5]);
+ sl[6] = vsubq_s32(left[1], left[6]);
+ sl[7] = vsubq_s32(left[0], left[7]);
+ sr[0] = vaddq_s32(right[0], right[7]);
+ sr[1] = vaddq_s32(right[1], right[6]);
+ sr[2] = vaddq_s32(right[2], right[5]);
+ sr[3] = vaddq_s32(right[3], right[4]);
+ sr[4] = vsubq_s32(right[3], right[4]);
+ sr[5] = vsubq_s32(right[2], right[5]);
+ sr[6] = vsubq_s32(right[1], right[6]);
+ sr[7] = vsubq_s32(right[0], right[7]);
+
+ // fdct4(step, step);
+ // x0 = s0 + s3;
+ xl[0] = vaddq_s32(sl[0], sl[3]);
+ xr[0] = vaddq_s32(sr[0], sr[3]);
+ // x1 = s1 + s2;
+ xl[1] = vaddq_s32(sl[1], sl[2]);
+ xr[1] = vaddq_s32(sr[1], sr[2]);
+ // x2 = s1 - s2;
+ xl[2] = vsubq_s32(sl[1], sl[2]);
+ xr[2] = vsubq_s32(sr[1], sr[2]);
+ // x3 = s0 - s3;
+ xl[3] = vsubq_s32(sl[0], sl[3]);
+ xr[3] = vsubq_s32(sr[0], sr[3]);
+
+ // fdct4(step, step);
+ // out[0] = (tran_low_t)fdct_round_shift((x0 + x1) * cospi_16_64)
+ // out[4] = (tran_low_t)fdct_round_shift((x0 - x1) * cospi_16_64)
+ butterfly_one_coeff_s32_fast(xl[0], xr[0], xl[1], xr[1], cospi_16_64,
+ &left[0], &right[0], &left[4], &right[4]);
+ // out[2] = (tran_low_t)fdct_round_shift(x2 * cospi_24_64 + x3 * cospi_8_64)
+ // out[6] = (tran_low_t)fdct_round_shift(-x2 * cospi_8_64 + x3 * cospi_24_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[2], xr[2], cospi_8_64,
+ cospi_24_64, &left[2], &right[2], &left[6],
+ &right[6]);
+
+ // Stage 2
+ // t0 = (s6 - s5) * cospi_16_64;
+ // t1 = (s6 + s5) * cospi_16_64;
+ butterfly_one_coeff_s32_fast(sl[6], sr[6], sl[5], sr[5], cospi_16_64, &tl[1],
+ &tr[1], &tl[0], &tr[0]);
+
+ // Stage 3
+ xl[0] = vaddq_s32(sl[4], tl[0]);
+ xr[0] = vaddq_s32(sr[4], tr[0]);
+ xl[1] = vsubq_s32(sl[4], tl[0]);
+ xr[1] = vsubq_s32(sr[4], tr[0]);
+ xl[2] = vsubq_s32(sl[7], tl[1]);
+ xr[2] = vsubq_s32(sr[7], tr[1]);
+ xl[3] = vaddq_s32(sl[7], tl[1]);
+ xr[3] = vaddq_s32(sr[7], tr[1]);
+
+ // Stage 4
+ // out[1] = (tran_low_t)fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+ // out[7] = (tran_low_t)fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[3], xr[3], xl[0], xr[0], cospi_4_64,
+ cospi_28_64, &left[1], &right[1], &left[7],
+ &right[7]);
+
+ // out[5] = (tran_low_t)fdct_round_shift(x1 * cospi_12_64 + x2 * cospi_20_64)
+ // out[3] = (tran_low_t)fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+ butterfly_two_coeff_s32_s64_narrow(xl[2], xr[2], xl[1], xr[1], cospi_20_64,
+ cospi_12_64, &left[5], &right[5], &left[3],
+ &right[3]);
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass1_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass1_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
+static INLINE void vpx_highbd_fdct8x8_pass2_neon(int32x4_t *left,
+ int32x4_t *right) {
+ int32x4x2_t out[8];
+ vpx_highbd_fdct8x8_pass2_notranspose_neon(left, right);
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ left[0] = out[0].val[0];
+ right[0] = out[0].val[1];
+ left[1] = out[1].val[0];
+ right[1] = out[1].val[1];
+ left[2] = out[2].val[0];
+ right[2] = out[2].val[1];
+ left[3] = out[3].val[0];
+ right[3] = out[3].val[1];
+ left[4] = out[4].val[0];
+ right[4] = out[4].val[1];
+ left[5] = out[5].val[0];
+ right[5] = out[5].val[1];
+ left[6] = out[6].val[0];
+ right[6] = out[6].val[1];
+ left[7] = out[7].val[0];
+ right[7] = out[7].val[1];
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+#endif // VPX_VPX_DSP_ARM_FDCT8X8_NEON_H_
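The pass helpers above follow the standard separable-DCT pattern: apply the 1-D 8-point transform to the rows, transpose so the next pass again reads rows, and repeat (the highbd versions do the same with each row split into left/right int32x4 halves). A minimal scalar sketch of that control flow; dct1d here is a hypothetical stand-in for the 1-D transform:

#include <stdint.h>
#include <string.h>

typedef void (*dct1d_fn)(const int32_t in[8], int32_t out[8]);

static void transpose_8x8(int32_t m[8][8]) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = r + 1; c < 8; ++c) {
      const int32_t t = m[r][c];
      m[r][c] = m[c][r];
      m[c][r] = t;
    }
  }
}

// Two row passes with a transpose between them give the 2-D transform.
static void fdct8x8_2d(int32_t m[8][8], dct1d_fn dct1d) {
  int32_t row[8];
  int r;
  for (r = 0; r < 8; ++r) {
    dct1d(m[r], row);
    memcpy(m[r], row, sizeof(row));
  }
  transpose_8x8(m);  // pass 2 then reads the former columns as rows
  for (r = 0; r < 8; ++r) {
    dct1d(m[r], row);
    memcpy(m[r], row, sizeof(row));
  }
  transpose_8x8(m);  // restore row-major orientation
}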
diff --git a/libvpx/vpx_dsp/arm/fdct_neon.h b/libvpx/vpx_dsp/arm/fdct_neon.h
index 28d7d86bf..193594e3d 100644
--- a/libvpx/vpx_dsp/arm/fdct_neon.h
+++ b/libvpx/vpx_dsp/arm/fdct_neon.h
@@ -13,201 +13,411 @@
#include <arm_neon.h>
-static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
- const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
- const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
-
- // in_0 +/- in_3, in_1 +/- in_2
- const int16x8_t s_01 = vaddq_s16(input_01, input_32);
- const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
- // step_0 +/- step_1, step_2 +/- step_3
- const int16x4_t s_0 = vget_low_s16(s_01);
- const int16x4_t s_1 = vget_high_s16(s_01);
- const int16x4_t s_2 = vget_high_s16(s_32);
- const int16x4_t s_3 = vget_low_s16(s_32);
-
- // (s_0 +/- s_1) * cospi_16_64
- // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
- const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
- const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
- const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
- const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
- // fdct_round_shift
- int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
- int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
- // s_3 * cospi_8_64 + s_2 * cospi_24_64
- // s_3 * cospi_24_64 - s_2 * cospi_8_64
- const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
- const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
- const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
- const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
- // fdct_round_shift
- int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
- int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
- transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
- in[0] = out_0;
- in[1] = out_1;
- in[2] = out_2;
- in[3] = out_3;
-}
-
-static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
- int16x8_t *out) {
- const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
- const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
- const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
- const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
- const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
- const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
- const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
- const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
- // fdct4(step, step);
- int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
- int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
- int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
- int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
- // fdct4(step, step);
- int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
- int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
- int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
- int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
- int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
- int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
- v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
- v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
- v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
- v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
- v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
- v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out[0] = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
- out[2] = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
- out[4] = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
- out[6] = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
- }
- // Stage 2
- v_x0 = vsubq_s16(v_s6, v_s5);
- v_x1 = vaddq_s16(v_s6, v_s5);
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x8_t ab = vcombine_s16(a, b);
- const int16x8_t cd = vcombine_s16(c, d);
- // Stage 3
- v_x0 = vaddq_s16(v_s4, ab);
- v_x1 = vsubq_s16(v_s4, ab);
- v_x2 = vsubq_s16(v_s7, cd);
- v_x3 = vaddq_s16(v_s7, cd);
- }
- // Stage 4
- v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
- v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
- v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
- v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
- v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
- v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
- v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
- v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
- v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
- v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
- v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
- v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
- v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
- v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
- v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
- v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
- {
- const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
- const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
- const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
- const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
- const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
- const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
- const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
- const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
- out[1] = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
- out[3] = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
- out[5] = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
- out[7] = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
- }
-}
-
-static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
- int16x8_t out[8];
- vpx_fdct8x8_pass1_notranspose_neon(in, out);
- // transpose 8x8
- // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
- // columns.
- {
- // 00 01 02 03 40 41 42 43
- // 10 11 12 13 50 51 52 53
- // 20 21 22 23 60 61 62 63
- // 30 31 32 33 70 71 72 73
- // 04 05 06 07 44 45 46 47
- // 14 15 16 17 54 55 56 57
- // 24 25 26 27 64 65 66 67
- // 34 35 36 37 74 75 76 77
- const int32x4x2_t r02_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
- const int32x4x2_t r13_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
- const int32x4x2_t r46_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
- const int32x4x2_t r57_s32 =
- vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
- const int16x8x2_t r01_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
- vreinterpretq_s16_s32(r13_s32.val[0]));
- const int16x8x2_t r23_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
- vreinterpretq_s16_s32(r13_s32.val[1]));
- const int16x8x2_t r45_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
- vreinterpretq_s16_s32(r57_s32.val[0]));
- const int16x8x2_t r67_s16 =
- vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
- vreinterpretq_s16_s32(r57_s32.val[1]));
- in[0] = r01_s16.val[0];
- in[1] = r01_s16.val[1];
- in[2] = r23_s16.val[0];
- in[3] = r23_s16.val[1];
- in[4] = r45_s16.val[0];
- in[5] = r45_s16.val[1];
- in[6] = r67_s16.val[0];
- in[7] = r67_s16.val[1];
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
+// fdct_round_shift((a +/- b) * c)
+// Variant using the fast vqrdmulh_s16 operation on a half vector.
+// Can be slightly less accurate, which is adequate for pass 1.
+static INLINE void butterfly_one_coeff_s16_fast_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant,
+ int16x4_t *add,
+ int16x4_t *sub) {
+ int16x4_t c = vdup_n_s16(2 * constant);
+ *add = vqrdmulh_s16(vadd_s16(a, b), c);
+ *sub = vqrdmulh_s16(vsub_s16(a, b), c);
}
+
+// fdct_round_shift((a +/- b) * c)
+// Variant using the fast vqrdmulhq_s16 operation on a full vector.
+// Can be slightly less accurate, which is adequate for pass 1.
+static INLINE void butterfly_one_coeff_s16_fast(const int16x8_t a,
+ const int16x8_t b,
+ const tran_coef_t constant,
+ int16x8_t *add,
+ int16x8_t *sub) {
+ int16x8_t c = vdupq_n_s16(2 * constant);
+ *add = vqrdmulhq_s16(vaddq_s16(a, b), c);
+ *sub = vqrdmulhq_s16(vsubq_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant using the fast vqrdmulhq_s32 operation on a full
+// vector: does 32-bit processing, takes 16-bit input values and returns
+// full 32-bit values as high/low halves.
+static INLINE void butterfly_one_coeff_s16_s32_fast(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ const int16x4_t a_lo = vget_low_s16(a);
+ const int16x4_t a_hi = vget_high_s16(a);
+ const int16x4_t b_lo = vget_low_s16(b);
+ const int16x4_t b_hi = vget_high_s16(b);
+ *add_lo = vqrdmulhq_s32(vaddl_s16(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddl_s16(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubl_s16(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubl_s16(a_hi, b_hi), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant using the fast vqrdmulhq_s32 operation on a full
+// vector: does 32-bit processing, takes 16-bit input values and returns
+// results narrowed back down to 16 bits.
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add_lo, add_hi, sub_lo, sub_hi;
+ butterfly_one_coeff_s16_s32_fast(a, b, constant, &add_lo, &add_hi, &sub_lo,
+ &sub_hi);
+ *add = vcombine_s16(vmovn_s32(add_lo), vmovn_s32(add_hi));
+ *sub = vcombine_s16(vmovn_s32(sub_lo), vmovn_s32(sub_hi));
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant using the fast vqrdmulhq_s32 operation on a half
+// vector: does 32-bit processing, takes 16-bit input values and returns
+// full 32-bit values.
+static INLINE void butterfly_one_coeff_s16_s32_fast_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int32x4_t *add, int32x4_t *sub) {
+ int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddl_s16(a, b), c);
+ *sub = vqrdmulhq_s32(vsubl_s16(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant using the fast vqrdmulhq_s32 operation on a half
+// vector: does 32-bit processing, takes 16-bit input values and returns
+// results narrowed back down to 16 bits.
+static INLINE void butterfly_one_coeff_s16_s32_fast_narrow_half(
+ const int16x4_t a, const int16x4_t b, const tran_coef_t constant,
+ int16x4_t *add, int16x4_t *sub) {
+ int32x4_t add32, sub32;
+ butterfly_one_coeff_s16_s32_fast_half(a, b, constant, &add32, &sub32);
+ *add = vmovn_s32(add32);
+ *sub = vmovn_s32(sub32);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant with the normal implementation on a full vector:
+// fully accurate, does 32-bit processing, takes 16-bit values and
+// returns full 32-bit values as high/low halves.
+static INLINE void butterfly_one_coeff_s16_s32(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int32x4_t *add_lo, int32x4_t *add_hi, int32x4_t *sub_lo,
+ int32x4_t *sub_hi) {
+ const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), constant);
+ const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), constant);
+ const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), constant);
+ const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), constant);
+ const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), constant);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// Original variant with the normal implementation on a full vector:
+// fully accurate, does 32-bit processing, takes 16-bit values and
+// returns results narrowed back down to 16 bits.
+static INLINE void butterfly_one_coeff_s16_s32_narrow(
+ const int16x8_t a, const int16x8_t b, const tran_coef_t constant,
+ int16x8_t *add, int16x8_t *sub) {
+ int32x4_t add32_lo, add32_hi, sub32_lo, sub32_hi;
+ butterfly_one_coeff_s16_s32(a, b, constant, &add32_lo, &add32_hi, &sub32_lo,
+ &sub32_hi);
+ *add = vcombine_s16(vmovn_s32(add32_lo), vmovn_s32(add32_hi));
+ *sub = vcombine_s16(vmovn_s32(sub32_lo), vmovn_s32(sub32_hi));
+}
+
+// (a +/- b) * c, without the final rounding shift
+// Variant with plain (non-saturating) multiplies on a full vector: does
+// 32-bit processing, takes and returns 32-bit values as high/low halves,
+// and leaves fdct_round_shift to the caller.
+static INLINE void butterfly_one_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant using the fast vqrdmulhq_s32 operation on a half
+// vector: does 32-bit processing, takes and returns 32-bit values.
+static INLINE void butterfly_one_coeff_s32_fast_half(const int32x4_t a,
+ const int32x4_t b,
+ const tran_coef_t constant,
+ int32x4_t *add,
+ int32x4_t *sub) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add = vqrdmulhq_s32(vaddq_s32(a, b), c);
+ *sub = vqrdmulhq_s32(vsubq_s32(a, b), c);
+}
+
+// fdct_round_shift((a +/- b) * c)
+// More accurate variant using the fast vqrdmulhq_s32 operation on a full
+// vector: does 32-bit processing, takes and returns 32-bit values as
+// high/low halves.
+static INLINE void butterfly_one_coeff_s32_fast(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant, int32x4_t *add_lo,
+ int32x4_t *add_hi, int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t c = vdupq_n_s32(constant << 17);
+ *add_lo = vqrdmulhq_s32(vaddq_s32(a_lo, b_lo), c);
+ *add_hi = vqrdmulhq_s32(vaddq_s32(a_hi, b_hi), c);
+ *sub_lo = vqrdmulhq_s32(vsubq_s32(a_lo, b_lo), c);
+ *sub_hi = vqrdmulhq_s32(vsubq_s32(a_hi, b_hi), c);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// More accurate variant with the normal implementation on a half vector:
+// does 64-bit processing, takes 32-bit values and returns results
+// narrowed back down to 32 bits.
+static INLINE void butterfly_two_coeff_s32_s64_narrow_half(
+ const int32x4_t a, const int32x4_t b, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add, int32x4_t *sub) {
+ const int32x2_t a_lo = vget_low_s32(a);
+ const int32x2_t a_hi = vget_high_s32(a);
+ const int32x2_t b_lo = vget_low_s32(b);
+ const int32x2_t b_hi = vget_high_s32(b);
+
+ const int64x2_t axc0_64_lo = vmull_n_s32(a_lo, constant1);
+ const int64x2_t axc0_64_hi = vmull_n_s32(a_hi, constant1);
+ const int64x2_t axc1_64_lo = vmull_n_s32(a_lo, constant2);
+ const int64x2_t axc1_64_hi = vmull_n_s32(a_hi, constant2);
+
+ const int64x2_t sum_lo = vmlal_n_s32(axc0_64_lo, b_lo, constant2);
+ const int64x2_t sum_hi = vmlal_n_s32(axc0_64_hi, b_hi, constant2);
+ const int64x2_t diff_lo = vmlsl_n_s32(axc1_64_lo, b_lo, constant1);
+ const int64x2_t diff_hi = vmlsl_n_s32(axc1_64_hi, b_hi, constant1);
+
+ *add = vcombine_s32(vrshrn_n_s64(sum_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(sum_hi, DCT_CONST_BITS));
+ *sub = vcombine_s32(vrshrn_n_s64(diff_lo, DCT_CONST_BITS),
+ vrshrn_n_s64(diff_hi, DCT_CONST_BITS));
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// More accurate variant with the normal implementation on a full vector:
+// does 64-bit processing, takes 32-bit values and returns results
+// narrowed back down to 32 bits, as high/low halves.
+static INLINE void butterfly_two_coeff_s32_s64_narrow(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ // ac1/ac2 hold the following values:
+ // ac1: vget_low_s32(a_lo) * c1, vget_high_s32(a_lo) * c1,
+ // vget_low_s32(a_hi) * c1, vget_high_s32(a_hi) * c1
+ // ac2: vget_low_s32(a_lo) * c2, vget_high_s32(a_lo) * c2,
+ // vget_low_s32(a_hi) * c2, vget_high_s32(a_hi) * c2
+ int64x2_t ac1[4];
+ int64x2_t ac2[4];
+ int64x2_t sum[4];
+ int64x2_t diff[4];
+
+ ac1[0] = vmull_n_s32(vget_low_s32(a_lo), constant1);
+ ac1[1] = vmull_n_s32(vget_high_s32(a_lo), constant1);
+ ac1[2] = vmull_n_s32(vget_low_s32(a_hi), constant1);
+ ac1[3] = vmull_n_s32(vget_high_s32(a_hi), constant1);
+ ac2[0] = vmull_n_s32(vget_low_s32(a_lo), constant2);
+ ac2[1] = vmull_n_s32(vget_high_s32(a_lo), constant2);
+ ac2[2] = vmull_n_s32(vget_low_s32(a_hi), constant2);
+ ac2[3] = vmull_n_s32(vget_high_s32(a_hi), constant2);
+
+ sum[0] = vmlal_n_s32(ac1[0], vget_low_s32(b_lo), constant2);
+ sum[1] = vmlal_n_s32(ac1[1], vget_high_s32(b_lo), constant2);
+ sum[2] = vmlal_n_s32(ac1[2], vget_low_s32(b_hi), constant2);
+ sum[3] = vmlal_n_s32(ac1[3], vget_high_s32(b_hi), constant2);
+ *add_lo = vcombine_s32(vrshrn_n_s64(sum[0], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[1], DCT_CONST_BITS));
+ *add_hi = vcombine_s32(vrshrn_n_s64(sum[2], DCT_CONST_BITS),
+ vrshrn_n_s64(sum[3], DCT_CONST_BITS));
+
+ diff[0] = vmlsl_n_s32(ac2[0], vget_low_s32(b_lo), constant1);
+ diff[1] = vmlsl_n_s32(ac2[1], vget_high_s32(b_lo), constant1);
+ diff[2] = vmlsl_n_s32(ac2[2], vget_low_s32(b_hi), constant1);
+ diff[3] = vmlsl_n_s32(ac2[3], vget_high_s32(b_hi), constant1);
+ *sub_lo = vcombine_s32(vrshrn_n_s64(diff[0], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[1], DCT_CONST_BITS));
+ *sub_hi = vcombine_s32(vrshrn_n_s64(diff[2], DCT_CONST_BITS),
+ vrshrn_n_s64(diff[3], DCT_CONST_BITS));
+}
+
+// a * c1 +/- b * c2, without the final rounding shift
+// Variant with the normal implementation on a full vector: does 32-bit
+// processing, takes 16-bit values and returns unrounded 32-bit values as
+// high/low halves.
+static INLINE void butterfly_two_coeff_s16_s32_noround(
+ const int16x4_t a_lo, const int16x4_t a_hi, const int16x4_t b_lo,
+ const int16x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmull_n_s16(a_lo, constant1);
+ const int32x4_t a2 = vmull_n_s16(a_hi, constant1);
+ const int32x4_t a3 = vmull_n_s16(a_lo, constant2);
+ const int32x4_t a4 = vmull_n_s16(a_hi, constant2);
+ *add_lo = vmlal_n_s16(a1, b_lo, constant2);
+ *add_hi = vmlal_n_s16(a2, b_hi, constant2);
+ *sub_lo = vmlsl_n_s16(a3, b_lo, constant1);
+ *sub_hi = vmlsl_n_s16(a4, b_hi, constant1);
+}
+
+// a * c1 +/- b * c2, without the final rounding shift
+// Variant with the normal implementation on a full vector: does 32-bit
+// processing, takes and returns 32-bit values as high/low halves, without
+// rounding or narrowing.
+static INLINE void butterfly_two_coeff_s32_noround(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ *add_lo = vmlaq_n_s32(a1, b_lo, constant2);
+ *add_hi = vmlaq_n_s32(a2, b_hi, constant2);
+ *sub_lo = vmlsq_n_s32(a3, b_lo, constant1);
+ *sub_hi = vmlsq_n_s32(a4, b_hi, constant1);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Variant with the normal implementation on a half vector: does 32-bit
+// processing, takes 16-bit values and returns results narrowed back
+// down to 16 bits.
+static INLINE void butterfly_two_coeff_half(const int16x4_t a,
+ const int16x4_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x4_t *add, int16x4_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(a, constant1);
+ const int32x4_t a2 = vmull_n_s16(a, constant2);
+ const int32x4_t sum = vmlal_n_s16(a1, b, constant2);
+ const int32x4_t diff = vmlsl_n_s16(a2, b, constant1);
+ *add = vqrshrn_n_s32(sum, DCT_CONST_BITS);
+ *sub = vqrshrn_n_s32(diff, DCT_CONST_BITS);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original variant with the normal implementation on a full vector:
+// does 32-bit processing, takes 16-bit values and returns results
+// narrowed back down to 16 bits.
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+ const tran_coef_t constant1,
+ const tran_coef_t constant2,
+ int16x8_t *add, int16x8_t *sub) {
+ const int32x4_t a1 = vmull_n_s16(vget_low_s16(a), constant1);
+ const int32x4_t a2 = vmull_n_s16(vget_high_s16(a), constant1);
+ const int32x4_t a3 = vmull_n_s16(vget_low_s16(a), constant2);
+ const int32x4_t a4 = vmull_n_s16(vget_high_s16(a), constant2);
+ const int32x4_t sum0 = vmlal_n_s16(a1, vget_low_s16(b), constant2);
+ const int32x4_t sum1 = vmlal_n_s16(a2, vget_high_s16(b), constant2);
+ const int32x4_t diff0 = vmlsl_n_s16(a3, vget_low_s16(b), constant1);
+ const int32x4_t diff1 = vmlsl_n_s16(a4, vget_high_s16(b), constant1);
+ const int16x4_t rounded0 = vqrshrn_n_s32(sum0, DCT_CONST_BITS);
+ const int16x4_t rounded1 = vqrshrn_n_s32(sum1, DCT_CONST_BITS);
+ const int16x4_t rounded2 = vqrshrn_n_s32(diff0, DCT_CONST_BITS);
+ const int16x4_t rounded3 = vqrshrn_n_s32(diff1, DCT_CONST_BITS);
+ *add = vcombine_s16(rounded0, rounded1);
+ *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c1 +/- b * c2)
+// Original variant with the normal implementation on a full vector:
+// does 32-bit processing, takes and returns 32-bit values as high/low
+// halves.
+static INLINE void butterfly_two_coeff_s32(
+ const int32x4_t a_lo, const int32x4_t a_hi, const int32x4_t b_lo,
+ const int32x4_t b_hi, const tran_coef_t constant1,
+ const tran_coef_t constant2, int32x4_t *add_lo, int32x4_t *add_hi,
+ int32x4_t *sub_lo, int32x4_t *sub_hi) {
+ const int32x4_t a1 = vmulq_n_s32(a_lo, constant1);
+ const int32x4_t a2 = vmulq_n_s32(a_hi, constant1);
+ const int32x4_t a3 = vmulq_n_s32(a_lo, constant2);
+ const int32x4_t a4 = vmulq_n_s32(a_hi, constant2);
+ const int32x4_t sum0 = vmlaq_n_s32(a1, b_lo, constant2);
+ const int32x4_t sum1 = vmlaq_n_s32(a2, b_hi, constant2);
+ const int32x4_t diff0 = vmlsq_n_s32(a3, b_lo, constant1);
+ const int32x4_t diff1 = vmlsq_n_s32(a4, b_hi, constant1);
+ *add_lo = vrshrq_n_s32(sum0, DCT_CONST_BITS);
+ *add_hi = vrshrq_n_s32(sum1, DCT_CONST_BITS);
+ *sub_lo = vrshrq_n_s32(diff0, DCT_CONST_BITS);
+ *sub_hi = vrshrq_n_s32(diff1, DCT_CONST_BITS);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int16x8_t add_round_shift_s16(const int16x8_t a) {
+ const int16x8_t one = vdupq_n_s16(1);
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vshrq_n_s16(vaddq_s16(vaddq_s16(a, a_sign_s16), one), 2);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding
+// and return narrowed results.
+static INLINE int16x8_t add_round_shift_s32_narrow(const int32x4_t a_lo,
+ const int32x4_t a_hi) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_lo_u32 = vreinterpretq_u32_s32(a_lo);
+ const uint32x4_t a_lo_sign_u32 = vshrq_n_u32(a_lo_u32, 31);
+ const int32x4_t a_lo_sign_s32 = vreinterpretq_s32_u32(a_lo_sign_u32);
+ const int16x4_t b_lo =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_lo, a_lo_sign_s32), one), 2);
+ const uint32x4_t a_hi_u32 = vreinterpretq_u32_s32(a_hi);
+ const uint32x4_t a_hi_sign_u32 = vshrq_n_u32(a_hi_u32, 31);
+ const int32x4_t a_hi_sign_s32 = vreinterpretq_s32_u32(a_hi_sign_u32);
+ const int16x4_t b_hi =
+ vshrn_n_s32(vqaddq_s32(vqaddq_s32(a_hi, a_hi_sign_s32), one), 2);
+ return vcombine_s16(b_lo, b_hi);
+}
+
+// Add 1 if negative, and shift by 1.
+// In practice, add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_half_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(a, a_sign_s32), 1);
+}
+
+// Add 1 if positive, 2 if negative, and shift by 2.
+// In practice, add 1, then add the sign bit, then shift without rounding.
+static INLINE int32x4_t add_round_shift_s32(const int32x4_t a) {
+ const int32x4_t one = vdupq_n_s32(1);
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vshrq_n_s32(vaddq_s32(vaddq_s32(a, a_sign_s32), one), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int16x8_t sub_round_shift_s16(const int16x8_t a) {
+ const uint16x8_t a_u16 = vreinterpretq_u16_s16(a);
+ const uint16x8_t a_sign_u16 = vshrq_n_u16(a_u16, 15);
+ const int16x8_t a_sign_s16 = vreinterpretq_s16_u16(a_sign_u16);
+ return vrshrq_n_s16(vsubq_s16(a, a_sign_s16), 2);
+}
+
+// Add 2 if positive, 1 if negative, and shift by 2.
+// In practice, subtract the sign bit, then shift with rounding.
+static INLINE int32x4_t sub_round_shift_s32(const int32x4_t a) {
+ const uint32x4_t a_u32 = vreinterpretq_u32_s32(a);
+ const uint32x4_t a_sign_u32 = vshrq_n_u32(a_u32, 31);
+ const int32x4_t a_sign_s32 = vreinterpretq_s32_u32(a_sign_u32);
+ return vrshrq_n_s32(vsubq_s32(a, a_sign_s32), 2);
+}
+
#endif // VPX_VPX_DSP_ARM_FDCT_NEON_H_
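The "fast" butterfly variants above lean on one identity: for N-bit lanes, vqrdmulh computes (a * b * 2 + (1 << (N - 1))) >> N. Pre-doubling the 14-bit cosine constant (or shifting it left by 17 in the s32 variants, where the doubling supplies the remaining bit, 18 + 14 == 32) therefore reproduces fdct_round_shift(a * c) with DCT_CONST_BITS == 14, the 0x8000 * 0x8000 saturation corner aside. A scalar sketch of the 16-bit case:

#include <stdint.h>

// Scalar model of vqrdmulh_s16 (ignoring saturation of the
// 0x8000 * 0x8000 corner case).
static int16_t scalar_qrdmulh_s16(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b * 2 + (1 << 15)) >> 16);
}

// With the constant pre-doubled:
//   (a * 2c * 2 + (1 << 15)) >> 16 == (a * c + (1 << 13)) >> 14
// which is fdct_round_shift(a * c) for DCT_CONST_BITS == 14.
static int16_t fast_round_shift(int16_t a, int16_t c) {
  return scalar_qrdmulh_s16(a, (int16_t)(2 * c));
}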
diff --git a/libvpx/vpx_dsp/arm/fdct_partial_neon.c b/libvpx/vpx_dsp/arm/fdct_partial_neon.c
index 0a1cdca41..718dba0d9 100644
--- a/libvpx/vpx_dsp/arm/fdct_partial_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct_partial_neon.c
@@ -101,3 +101,68 @@ void vpx_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
output[0] = (tran_low_t)(sum >> 3);
output[1] = 0;
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_highbd_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int32_t sum;
+
+ int r = 0;
+ do {
+ const int16x8_t a = vld1q_s16(input);
+ const int16x8_t b = vld1q_s16(input + 8);
+ input += stride;
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(b));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(b));
+ r++;
+ } while (r < 16);
+
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+ partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+ sum = horizontal_add_int32x4(partial_sum[0]);
+
+ output[0] = (tran_low_t)(sum >> 1);
+ output[1] = 0;
+}
+
+void vpx_highbd_fdct32x32_1_neon(const int16_t *input, tran_low_t *output,
+ int stride) {
+ int32x4_t partial_sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ int32_t sum;
+
+ int r = 0;
+ do {
+ const int16x8_t a0 = vld1q_s16(input);
+ const int16x8_t a1 = vld1q_s16(input + 8);
+ const int16x8_t a2 = vld1q_s16(input + 16);
+ const int16x8_t a3 = vld1q_s16(input + 24);
+ input += stride;
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_low_s16(a0));
+ partial_sum[0] = vaddw_s16(partial_sum[0], vget_high_s16(a0));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_low_s16(a1));
+ partial_sum[1] = vaddw_s16(partial_sum[1], vget_high_s16(a1));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_low_s16(a2));
+ partial_sum[2] = vaddw_s16(partial_sum[2], vget_high_s16(a2));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_low_s16(a3));
+ partial_sum[3] = vaddw_s16(partial_sum[3], vget_high_s16(a3));
+ r++;
+ } while (r < 32);
+
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[1]);
+ partial_sum[2] = vaddq_s32(partial_sum[2], partial_sum[3]);
+ partial_sum[0] = vaddq_s32(partial_sum[0], partial_sum[2]);
+ sum = horizontal_add_int32x4(partial_sum[0]);
+
+ output[0] = (tran_low_t)(sum >> 3);
+ output[1] = 0;
+}
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
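Both kernels above compute only the DC term: the first coefficient of the full transform reduces to a scaled block sum, so the remaining outputs can be skipped (only output[1] is cleared, matching the C reference). A scalar sketch of the 16x16 case:

#include <stdint.h>

// Scalar model of vpx_highbd_fdct16x16_1_neon(): sum the block, scale, and
// store only the DC coefficient; output[1] is cleared to match the C code.
static void fdct16x16_1(const int16_t *input, int32_t *output, int stride) {
  int32_t sum = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) sum += input[r * stride + c];
  }
  output[0] = sum >> 1;
  output[1] = 0;
}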
diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
deleted file mode 100644
index d9161c6d3..000000000
--- a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_dsp/arm/idct_neon.h"
-#include "vpx_dsp/arm/fdct_neon.h"
-#include "vpx_dsp/arm/mem_neon.h"
-
-void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
- int stride) {
- int i;
- // stage 1
- int16x8_t in[8];
- in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
- in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
- in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
- in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
- in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
- in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
- in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
- in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
- for (i = 0; i < 2; ++i) {
- vpx_fdct8x8_pass1_neon(in);
- } // for
- {
- // from vpx_dct_sse2.c
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
- const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
- const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
- const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
- const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
- const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
- const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
- const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
- const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
- in[0] = vhsubq_s16(in[0], sign_in0);
- in[1] = vhsubq_s16(in[1], sign_in1);
- in[2] = vhsubq_s16(in[2], sign_in2);
- in[3] = vhsubq_s16(in[3], sign_in3);
- in[4] = vhsubq_s16(in[4], sign_in4);
- in[5] = vhsubq_s16(in[5], sign_in5);
- in[6] = vhsubq_s16(in[6], sign_in6);
- in[7] = vhsubq_s16(in[7], sign_in7);
- // store results
- store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
- store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
- store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
- store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
- store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
- store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
- store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
- store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
- }
-}
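The post-condition removed above survives in the replacement code via vhsubq_s16(in, sign_in), which evaluates (in - sign) >> 1 in a single instruction. The underlying identity, assuming arithmetic right shifts of negative values: n / 2 == (n - (n >> 15)) >> 1 for int16_t, since n >> 15 is -1 for negative n and 0 otherwise. A scalar check:

#include <assert.h>
#include <stdint.h>

// n / 2 for int16_t via shifts: subtracting the sign (0 or -1) before the
// arithmetic shift rounds toward zero, matching C division.
static int16_t div2_shift(int16_t n) {
  return (int16_t)((n - (n >> 15)) >> 1);
}

int main(void) {
  int n;
  for (n = INT16_MIN; n <= INT16_MAX; ++n) {
    assert(div2_shift((int16_t)n) == (int16_t)(n / 2));
  }
  return 0;
}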
diff --git a/libvpx/vpx_dsp/arm/hadamard_neon.c b/libvpx/vpx_dsp/arm/hadamard_neon.c
index 523a63c6f..f6b6d7e3c 100644
--- a/libvpx/vpx_dsp/arm/hadamard_neon.c
+++ b/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -114,3 +114,45 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
coeff += 8;
}
}
+
+void vpx_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int i;
+
+ /* Rearrange 32x32 to 16x64 and remove stride.
+ * Top left first. */
+ vpx_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ vpx_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
+ coeff + 256);
+ /* Bottom left. */
+ vpx_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
+ coeff + 512);
+ /* Bottom right. */
+ vpx_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+ coeff + 768);
+
+ for (i = 0; i < 256; i += 8) {
+ const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+ const int16x8_t a1 = load_tran_low_to_s16q(coeff + 256);
+ const int16x8_t a2 = load_tran_low_to_s16q(coeff + 512);
+ const int16x8_t a3 = load_tran_low_to_s16q(coeff + 768);
+
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vhaddq_s16(b0, b2);
+ const int16x8_t c1 = vhaddq_s16(b1, b3);
+ const int16x8_t c2 = vhsubq_s16(b0, b2);
+ const int16x8_t c3 = vhsubq_s16(b1, b3);
+
+ store_s16q_to_tran_low(coeff + 0, c0);
+ store_s16q_to_tran_low(coeff + 256, c1);
+ store_s16q_to_tran_low(coeff + 512, c2);
+ store_s16q_to_tran_low(coeff + 768, c3);
+
+ coeff += 8;
+ }
+}
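Each loop iteration above combines one coefficient position across the four 16x16 sub-blocks with a halving butterfly; the >> 1 at every stage mirrors vhaddq_s16/vhsubq_s16 and keeps the running values within int16_t. A scalar model of the combine for a single position (assuming arithmetic right shifts of negative values):

#include <stdint.h>

// Scalar model of the per-position combine: a halving butterfly over the
// four 16x16 Hadamard outputs a0..a3. Intermediate sums fit in int thanks
// to integer promotion, just like the widened vhaddq/vhsubq lanes.
static void hadamard_combine4(int16_t a0, int16_t a1, int16_t a2, int16_t a3,
                              int16_t out[4]) {
  const int16_t b0 = (int16_t)((a0 + a1) >> 1);
  const int16_t b1 = (int16_t)((a0 - a1) >> 1);
  const int16_t b2 = (int16_t)((a2 + a3) >> 1);
  const int16_t b3 = (int16_t)((a2 - a3) >> 1);
  out[0] = (int16_t)((b0 + b2) >> 1);
  out[1] = (int16_t)((b1 + b3) >> 1);
  out[2] = (int16_t)((b0 - b2) >> 1);
  out[3] = (int16_t)((b1 - b3) >> 1);
}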
diff --git a/libvpx/vpx_dsp/arm/highbd_quantize_neon.c b/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 000000000..b9f72a94c
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/mem_neon.h"
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store(
+ const int32x4_t dqcoeff_0, const int32x4_t dqcoeff_1,
+ tran_low_t *dqcoeff_ptr) {
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE void highbd_quantize_8_neon(
+ const int32x4_t coeff_0, const int32x4_t coeff_1, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant, const int32x4_t quant_shift,
+ int32x4_t *qcoeff_0, int32x4_t *qcoeff_1) {
+  // The coefficients arrive as 2 vectors of 4 x 32-bit ints each; take their
+  // sign and absolute values.
+ const int32x4_t coeff_0_sign = vshrq_n_s32(coeff_0, 31);
+ const int32x4_t coeff_1_sign = vshrq_n_s32(coeff_1, 31);
+ const int32x4_t coeff_0_abs = vabsq_s32(coeff_0);
+ const int32x4_t coeff_1_abs = vabsq_s32(coeff_1);
+
+  // Calculate 2 masks, nonzero where abs(coeff) >= zbin, i.e. outside the
+  // dead zone
+ const int32x4_t zbin_mask_0 =
+ vreinterpretq_s32_u32(vcgeq_s32(coeff_0_abs, zbin));
+ const int32x4_t zbin_mask_1 = vreinterpretq_s32_u32(
+ vcgeq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(zbin), 1)));
+
+ // Get the rounded values
+ const int32x4_t rounded_0 = vaddq_s32(coeff_0_abs, round);
+ const int32x4_t rounded_1 =
+ vaddq_s32(coeff_1_abs, vdupq_lane_s32(vget_low_s32(round), 1));
+
+  // (rounded * (quant << 15) * 2) >> 32 == (rounded * quant) >> 16
+ int32x4_t qcoeff_tmp_0 = vqdmulhq_s32(rounded_0, quant);
+ int32x4_t qcoeff_tmp_1 =
+ vqdmulhq_s32(rounded_1, vdupq_lane_s32(vget_low_s32(quant), 1));
+
+ // Add rounded values
+ qcoeff_tmp_0 = vaddq_s32(qcoeff_tmp_0, rounded_0);
+ qcoeff_tmp_1 = vaddq_s32(qcoeff_tmp_1, rounded_1);
+
+  // (qcoeff * (quant_shift << 15) * 2) >> 32 == (qcoeff * quant_shift) >> 16
+ qcoeff_tmp_0 = vqdmulhq_s32(qcoeff_tmp_0, quant_shift);
+ qcoeff_tmp_1 =
+ vqdmulhq_s32(qcoeff_tmp_1, vdupq_lane_s32(vget_low_s32(quant_shift), 1));
+
+ // Restore the sign bit.
+ qcoeff_tmp_0 = veorq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = veorq_s32(qcoeff_tmp_1, coeff_1_sign);
+ qcoeff_tmp_0 = vsubq_s32(qcoeff_tmp_0, coeff_0_sign);
+ qcoeff_tmp_1 = vsubq_s32(qcoeff_tmp_1, coeff_1_sign);
+
+ // Only keep the relevant coeffs
+ *qcoeff_0 = vandq_s32(qcoeff_tmp_0, zbin_mask_0);
+ *qcoeff_1 = vandq_s32(qcoeff_tmp_1, zbin_mask_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t
+highbd_quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin,
+ const int32x4_t round, const int32x4_t quant,
+ const int32x4_t quant_shift, const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each; the helper takes their
+  // sign and absolute values.
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+
+  // Only the first element of each vector holds the DC value; the remaining
+  // elements hold the AC value, so the upper half of an 8-wide vector can be
+  // reconstructed by duplicating the 2nd element. Passing a 4 x 32-bit
+  // vector is therefore enough.
+ int32x4_t zbin = vmovl_s16(vld1_s16(zbin_ptr));
+ int32x4_t round = vmovl_s16(vld1_s16(round_ptr));
+  // Extend the quant, quant_shift vectors to 32-bit elements and scale them
+  // into the high half so that vqdmulhq_s32 can be used.
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+ int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 15);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ n_coeffs -= 8;
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ do {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ n_coeffs -= 8;
+ } while (n_coeffs > 0);
+ }
+
+#ifdef __aarch64__
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // __aarch64__
+  // Silence compiler warnings about the unused function parameters.
+ (void)n_coeffs;
+ (void)scan;
+}
+
+static VPX_FORCE_INLINE int32x4_t extract_sign_bit(int32x4_t a) {
+ return vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(a), 31));
+}
+
+static VPX_FORCE_INLINE void highbd_calculate_dqcoeff_and_store_32x32(
+ int32x4_t dqcoeff_0, int32x4_t dqcoeff_1, tran_low_t *dqcoeff_ptr) {
+ // Add 1 if negative to round towards zero because the C uses division.
+ dqcoeff_0 = vaddq_s32(dqcoeff_0, extract_sign_bit(dqcoeff_0));
+ dqcoeff_1 = vaddq_s32(dqcoeff_1, extract_sign_bit(dqcoeff_1));
+
+ dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
+ dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
+}
+
+static VPX_FORCE_INLINE int16x8_t highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int32x4_t zbin, const int32x4_t round,
+ const int32x4_t quant, const int32x4_t quant_shift,
+ const int32x4_t dequant) {
+ int32x4_t qcoeff_0, qcoeff_1, dqcoeff_0, dqcoeff_1;
+
+  // Load coeffs as 2 vectors of 4 x 32-bit ints each; the helper takes their
+  // sign and absolute values.
+ const int32x4_t coeff_0 = vld1q_s32(coeff_ptr);
+ const int32x4_t coeff_1 = vld1q_s32(coeff_ptr + 4);
+ highbd_quantize_8_neon(coeff_0, coeff_1, zbin, round, quant, quant_shift,
+ &qcoeff_0, &qcoeff_1);
+
+ // Store the 32-bit qcoeffs
+ vst1q_s32(qcoeff_ptr, qcoeff_0);
+ vst1q_s32(qcoeff_ptr + 4, qcoeff_1);
+
+ // Calculate and store the dqcoeffs
+ dqcoeff_0 = vmulq_s32(qcoeff_0, dequant);
+ dqcoeff_1 = vmulq_s32(qcoeff_1, vdupq_lane_s32(vget_low_s32(dequant), 1));
+
+ highbd_calculate_dqcoeff_and_store_32x32(dqcoeff_0, dqcoeff_1, dqcoeff_ptr);
+
+ return vcombine_s16(vmovn_s32(qcoeff_0), vmovn_s32(qcoeff_1));
+}
+
+void vpx_highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int16x8_t neg_one = vdupq_n_s16(-1);
+ uint16x8_t eob_max;
+ int i;
+
+  // Only the first element of each vector holds the DC value; the remaining
+  // elements hold the AC value, so the upper half of an 8-wide vector can be
+  // reconstructed by duplicating the 2nd element. Passing a 4 x 32-bit
+  // vector is therefore enough.
+ int32x4_t zbin = vrshrq_n_s32(vmovl_s16(vld1_s16(zbin_ptr)), 1);
+ int32x4_t round = vrshrq_n_s32(vmovl_s16(vld1_s16(round_ptr)), 1);
+  // Extend the quant, quant_shift vectors to 32-bit elements and scale them
+  // into the high half so that vqdmulhq_s32 can be used.
+ int32x4_t quant = vshlq_n_s32(vmovl_s16(vld1_s16(quant_ptr)), 15);
+ int32x4_t quant_shift = vshlq_n_s32(vmovl_s16(vld1_s16(quant_shift_ptr)), 16);
+ int32x4_t dequant = vmovl_s16(vld1_s16(dequant_ptr));
+
+ // Process first 8 values which include a dc component.
+ {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+
+ {
+ zbin = vdupq_lane_s32(vget_low_s32(zbin), 1);
+ round = vdupq_lane_s32(vget_low_s32(round), 1);
+ quant = vdupq_lane_s32(vget_low_s32(quant), 1);
+ quant_shift = vdupq_lane_s32(vget_low_s32(quant_shift), 1);
+ dequant = vdupq_lane_s32(vget_low_s32(dequant), 1);
+
+ for (i = 1; i < 32 * 32 / 8; ++i) {
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
+
+ const int16x8_t qcoeff =
+ highbd_quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin,
+ round, quant, quant_shift, dequant);
+
+ // Set non-zero elements to -1 and use that to extract values for eob.
+ eob_max =
+ vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+
+ __builtin_prefetch(coeff_ptr + 64);
+ coeff_ptr += 8;
+ iscan += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ }
+ }
+
+#ifdef __aarch64__
+ *eob_ptr = vmaxvq_u16(eob_max);
+#else
+ {
+ const uint16x4_t eob_max_0 =
+ vmax_u16(vget_low_u16(eob_max), vget_high_u16(eob_max));
+ const uint16x4_t eob_max_1 = vpmax_u16(eob_max_0, eob_max_0);
+ const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
+ vst1_lane_u16(eob_ptr, eob_max_2, 0);
+ }
+#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)n_coeffs;
+ (void)scan;
+}
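
The pre-shifts by 15 and 16 applied to quant and quant_shift above exist
because vqdmulhq_s32(a, b) returns the saturated value of (2 * a * b) >> 32;
pre-scaling the constants turns that into exactly the shifts the C quantizer
uses. A scalar sketch of the identity (helper name hypothetical):

    static int32_t mulhi_model(int32_t a, int16_t c, int pre_shift) {
      // (2 * a * (c << pre_shift)) >> 32 == (a * c) >> (31 - pre_shift), so
      // pre_shift == 15 yields (a * quant) >> 16 and pre_shift == 16 yields
      // (a * quant_shift) >> 15.
      const int64_t scaled = (int64_t)c << pre_shift;
      return (int32_t)((2 * (int64_t)a * scaled) >> 32);
    }
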
diff --git a/libvpx/vpx_dsp/arm/highbd_sad_neon.c b/libvpx/vpx_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 000000000..ecb52ce5a
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+
+static VPX_FORCE_INLINE uint32_t highbd_sad4_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int width,
+ int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t src_u16 = vld1_u16(src16_ptr + j);
+ const uint16x4_t ref_u16 = vld1_u16(ref16_ptr + j);
+ sum_abs_diff = vabal_u16(sum_abs_diff, src_u16, ref_u16);
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
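
src_ptr and ref_ptr follow the high-bitdepth convention of uint16_t buffers
carried behind uint8_t pointers and recovered via CONVERT_TO_SHORTPTR; the
Neon loops accumulate the same total as this scalar reference (a sketch,
names hypothetical):

    static uint32_t highbd_sad_model(const uint16_t *src, int src_stride,
                                     const uint16_t *ref, int ref_stride,
                                     int width, int height) {
      uint32_t sad = 0;
      int i, j;
      for (i = 0; i < height; ++i) {
        for (j = 0; j < width; ++j)
          sad += src[j] > ref[j] ? src[j] - ref[j] : ref[j] - src[j];
        src += src_stride;
        ref += ref_stride;
      }
      return sad;
    }
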
+
+static VPX_FORCE_INLINE uint32_t highbd_sad8_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int width,
+ int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t src_u16 = vld1q_u16(src16_ptr + j);
+ const uint16x8_t ref_u16 = vld1q_u16(ref16_ptr + j);
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_low_u16(src_u16), vget_low_u16(ref_u16));
+ sum_abs_diff = vabal_u16(sum_abs_diff, vget_high_u16(src_u16),
+ vget_high_u16(ref_u16));
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+static VPX_FORCE_INLINE uint32_t highbd_sad4_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, const uint8_t *second_pred, int width, int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t a_u16 = vld1_u16(src16_ptr + j);
+ const uint16x4_t b_u16 = vld1_u16(ref16_ptr + j);
+ const uint16x4_t c_u16 = vld1_u16(pred_ptr + j);
+ const uint16x4_t avg = vrhadd_u16(b_u16, c_u16);
+ sum_abs_diff = vabal_u16(sum_abs_diff, a_u16, avg);
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred_ptr += width;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+static VPX_FORCE_INLINE uint32_t highbd_sad8_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, const uint8_t *second_pred, int width, int height) {
+ int i, j;
+ uint32x4_t sum_abs_diff = vdupq_n_u32(0);
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t a_u16 = vld1q_u16(src16_ptr + j);
+ const uint16x8_t b_u16 = vld1q_u16(ref16_ptr + j);
+ const uint16x8_t c_u16 = vld1q_u16(pred_ptr + j);
+ const uint16x8_t avg = vrhaddq_u16(b_u16, c_u16);
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_low_u16(a_u16), vget_low_u16(avg));
+ sum_abs_diff =
+ vabal_u16(sum_abs_diff, vget_high_u16(a_u16), vget_high_u16(avg));
+ }
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred_ptr += width;
+ }
+
+ return horizontal_add_uint32x4(sum_abs_diff);
+}
+
+#define highbd_sad4MxN(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ }
+
+#define highbd_sadMxN(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return highbd_sad8_neon(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \
+ }
+
+#define highbd_sad4MxN_avg(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return highbd_sad4_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
+ second_pred, m, n); \
+ }
+
+#define highbd_sadMxN_avg(m, n) \
+ unsigned int vpx_highbd_sad##m##x##n##_avg_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ return highbd_sad8_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, \
+ second_pred, m, n); \
+ }
+
+#define highbd_sadMxNx4D(m, n) \
+ void vpx_highbd_sad##m##x##n##x4d_neon( \
+ const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], int ref_stride, \
+ uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_neon(src_ptr, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ }
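
Each macro stamps out one thin wrapper per block size; for example,
highbd_sad4MxN(4, 4) below expands to:

    unsigned int vpx_highbd_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
                                        const uint8_t *ref_ptr,
                                        int ref_stride) {
      return highbd_sad4_neon(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
    }
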
+
+/* clang-format off */
+// 4x4
+highbd_sad4MxN(4, 4)
+highbd_sad4MxN_avg(4, 4)
+highbd_sadMxNx4D(4, 4)
+
+// 4x8
+highbd_sad4MxN(4, 8)
+highbd_sad4MxN_avg(4, 8)
+highbd_sadMxNx4D(4, 8)
+
+// 8x4
+highbd_sadMxN(8, 4)
+highbd_sadMxN_avg(8, 4)
+highbd_sadMxNx4D(8, 4)
+
+// 8x8
+highbd_sadMxN(8, 8)
+highbd_sadMxN_avg(8, 8)
+highbd_sadMxNx4D(8, 8)
+
+// 8x16
+highbd_sadMxN(8, 16)
+highbd_sadMxN_avg(8, 16)
+highbd_sadMxNx4D(8, 16)
+
+// 16x8
+highbd_sadMxN(16, 8)
+highbd_sadMxN_avg(16, 8)
+highbd_sadMxNx4D(16, 8)
+
+// 16x16
+highbd_sadMxN(16, 16)
+highbd_sadMxN_avg(16, 16)
+highbd_sadMxNx4D(16, 16)
+
+// 16x32
+highbd_sadMxN(16, 32)
+highbd_sadMxN_avg(16, 32)
+highbd_sadMxNx4D(16, 32)
+
+// 32x16
+highbd_sadMxN(32, 16)
+highbd_sadMxN_avg(32, 16)
+highbd_sadMxNx4D(32, 16)
+
+// 32x32
+highbd_sadMxN(32, 32)
+highbd_sadMxN_avg(32, 32)
+highbd_sadMxNx4D(32, 32)
+
+// 32x64
+highbd_sadMxN(32, 64)
+highbd_sadMxN_avg(32, 64)
+highbd_sadMxNx4D(32, 64)
+
+// 64x32
+highbd_sadMxN(64, 32)
+highbd_sadMxN_avg(64, 32)
+highbd_sadMxNx4D(64, 32)
+
+// 64x64
+highbd_sadMxN(64, 64)
+highbd_sadMxN_avg(64, 64)
+highbd_sadMxNx4D(64, 64)
+/* clang-format on */
diff --git a/libvpx/vpx_dsp/arm/highbd_variance_neon.c b/libvpx/vpx_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 000000000..96a35af01
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/sum_neon.h"
+#include "vpx_ports/mem.h"
+
+static const uint8_t bilinear_filters[8][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+static INLINE void highbd_variance16(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ int w, int h, uint64_t *sse,
+ int64_t *sum) {
+ int i, j;
+
+ if (w >= 8) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const int16x8_t src_s16 = vreinterpretq_s16_u16(vld1q_u16(&src_ptr[j]));
+ const int16x8_t ref_s16 = vreinterpretq_s16_u16(vld1q_u16(&ref_ptr[j]));
+ const int32x4_t diff1_s32 =
+ vsubl_s16(vget_low_s16(src_s16), vget_low_s16(ref_s16));
+ const int32x4_t diff2_s32 =
+ vsubl_s16(vget_high_s16(src_s16), vget_high_s16(ref_s16));
+ const uint32x4_t diff1_u32 = vreinterpretq_u32_s32(diff1_s32);
+ const uint32x4_t diff2_u32 = vreinterpretq_u32_s32(diff2_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff1_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff2_s32);
+ sse_u32 = vmlaq_u32(sse_u32, diff1_u32, diff1_u32);
+ sse_u32 = vmlaq_u32(sse_u32, diff2_u32, diff2_u32);
+ }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_uint32x4(sse_u32);
+ } else {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ assert(w >= 4);
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 4) {
+ const int16x4_t src_s16 = vreinterpret_s16_u16(vld1_u16(&src_ptr[j]));
+ const int16x4_t ref_s16 = vreinterpret_s16_u16(vld1_u16(&ref_ptr[j]));
+ const int32x4_t diff_s32 = vsubl_s16(src_s16, ref_s16);
+ const uint32x4_t diff_u32 = vreinterpretq_u32_s32(diff_s32);
+ sum_s32 = vaddq_s32(sum_s32, diff_s32);
+ sse_u32 = vmlaq_u32(sse_u32, diff_u32, diff_u32);
+ }
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = horizontal_add_uint32x4(sse_u32);
+ }
+}
+
+static INLINE void highbd_variance64(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint64_t *sse,
+ int64_t *sum) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref8_ptr);
+
+ if (w < 32 && h < 32) {
+ highbd_variance16(src_ptr, src_stride, ref_ptr, ref_stride, w, h, sse, sum);
+ } else {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ int k, l;
+ for (k = 0; k + 16 <= h; k += 16) {
+ for (l = 0; l + 16 <= w; l += 16) {
+ uint64_t sse_tmp = 0;
+ int64_t sum_tmp = 0;
+ highbd_variance16(src_ptr + l, src_stride, ref_ptr + l, ref_stride, 16,
+ 16, &sse_tmp, &sum_tmp);
+ sum_long += sum_tmp;
+ sse_long += sse_tmp;
+ }
+ src_ptr += 16 * src_stride;
+ ref_ptr += 16 * ref_stride;
+ }
+ *sum = sum_long;
+ *sse = sse_long;
+ }
+}
+
+static INLINE void highbd_8_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)sse_long;
+ *sum = (int)sum_long;
+}
+
+static INLINE void highbd_10_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static INLINE void highbd_12_variance(const uint8_t *src8_ptr, int src_stride,
+ const uint8_t *ref8_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(src8_ptr, src_stride, ref8_ptr, ref_stride, w, h, &sse_long,
+ &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
+
+#define HIGHBD_VAR(W, H) \
+ uint32_t vpx_highbd_8_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t vpx_highbd_10_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t vpx_highbd_12_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
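
The macro bodies above apply the identity N * Var = sum(d^2) - (sum(d))^2 / N
to the raw sums, clamping at zero in the 10- and 12-bit variants where the
ROUND_POWER_OF_TWO scaling could turn the result negative; a scalar sketch:

    static uint32_t variance_from_sums(uint32_t sse, int sum, int n) {
      // sse = sum of squared differences, sum = sum of differences, n = W * H.
      return sse - (uint32_t)(((int64_t)sum * sum) / n);
    }
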
+
+#define HIGHBD_GET_VAR(S) \
+ void vpx_highbd_8_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_10_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ } \
+ \
+ void vpx_highbd_12_get##S##x##S##var_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse, int *sum) { \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, S, S, sse, \
+ sum); \
+ }
+
+#define HIGHBD_MSE(W, H) \
+ uint32_t vpx_highbd_8_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_10_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t vpx_highbd_12_mse##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src_ptr, src_stride, ref_ptr, ref_stride, W, H, sse, \
+ &sum); \
+ return *sse; \
+ }
+
+static INLINE void highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ uint32_t i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+
+ uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
+ uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
+ uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
+ const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
+ uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
+ uint16x4_t out1_u16;
+ uint16x4_t out2_u16;
+ sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
+ sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
+ out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
+ out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
+ vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ } else {
+ assert(output_width >= 4);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
+ const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
+ uint16x4_t out_u16;
+ sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
+ out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
+ vst1_u16(&output_ptr[j], out_u16);
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ }
+}
+
+static INLINE void highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ uint32_t i, j;
+
+ uint32x4_t round_u32 = vshlq_n_u32(vdupq_n_u32(1), FILTER_BITS - 1);
+ uint16x4_t filter1_u16 = vdup_n_u16(filter[0]);
+ uint16x4_t filter2_u16 = vdup_n_u16(filter[1]);
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ const uint16x8_t src1_u16 = vld1q_u16(&src_ptr[j]);
+ const uint16x8_t src2_u16 = vld1q_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum1_u32 = vmull_u16(filter1_u16, vget_low_u16(src1_u16));
+ uint32x4_t sum2_u32 = vmull_u16(filter1_u16, vget_high_u16(src1_u16));
+ uint16x4_t out1_u16;
+ uint16x4_t out2_u16;
+ sum1_u32 = vmlal_u16(sum1_u32, filter2_u16, vget_low_u16(src2_u16));
+ sum2_u32 = vmlal_u16(sum2_u32, filter2_u16, vget_high_u16(src2_u16));
+ out1_u16 = vshrn_n_u32(vaddq_u32(sum1_u32, round_u32), FILTER_BITS);
+ out2_u16 = vshrn_n_u32(vaddq_u32(sum2_u32, round_u32), FILTER_BITS);
+ vst1q_u16(&output_ptr[j], vcombine_u16(out1_u16, out2_u16));
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ } else {
+ assert(output_width >= 4);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ const uint16x4_t src1_u16 = vld1_u16(&src_ptr[j]);
+ const uint16x4_t src2_u16 = vld1_u16(&src_ptr[j + pixel_step]);
+ uint32x4_t sum_u32 = vmull_u16(filter1_u16, src1_u16);
+ uint16x4_t out_u16;
+ sum_u32 = vmlal_u16(sum_u32, filter2_u16, src2_u16);
+ out_u16 = vshrn_n_u32(vaddq_u32(sum_u32, round_u32), FILTER_BITS);
+ vst1_u16(&output_ptr[j], out_u16);
+ }
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+ }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp2), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp2), W, ref_ptr, ref_stride, sse); \
+ }
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_8_variance##W##x##H##_neon(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_10_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
+ } \
+ \
+ uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_neon( \
+ const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ highbd_var_filter_block2d_bil_first_pass( \
+ src_ptr, fdata3, src_stride, 1, H + 1, W, bilinear_filters[x_offset]); \
+ highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+ bilinear_filters[y_offset]); \
+ \
+ vpx_highbd_comp_avg_pred_neon(temp3, CONVERT_TO_SHORTPTR(second_pred), W, \
+ H, temp2, W); \
+ \
+ return vpx_highbd_12_variance##W##x##H##_neon( \
+ CONVERT_TO_BYTEPTR(temp3), W, ref_ptr, ref_stride, sse); \
+ }
+
+void vpx_highbd_comp_avg_pred_neon(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ uint32x4_t one_u32 = vdupq_n_u32(1);
+ if (width >= 8) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 8) {
+ const uint16x8_t pred_u16 = vld1q_u16(&pred[j]);
+ const uint16x8_t ref_u16 = vld1q_u16(&ref[j]);
+ const uint32x4_t sum1_u32 =
+ vaddl_u16(vget_low_u16(pred_u16), vget_low_u16(ref_u16));
+ const uint32x4_t sum2_u32 =
+ vaddl_u16(vget_high_u16(pred_u16), vget_high_u16(ref_u16));
+ const uint16x4_t sum1_u16 =
+ vshrn_n_u32(vaddq_u32(sum1_u32, one_u32), 1);
+ const uint16x4_t sum2_u16 =
+ vshrn_n_u32(vaddq_u32(sum2_u32, one_u32), 1);
+ const uint16x8_t vcomp_pred = vcombine_u16(sum1_u16, sum2_u16);
+ vst1q_u16(&comp_pred[j], vcomp_pred);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else {
+ assert(width >= 4);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 4) {
+ const uint16x4_t pred_u16 = vld1_u16(&pred[j]);
+ const uint16x4_t ref_u16 = vld1_u16(&ref[j]);
+ const uint32x4_t sum_u32 = vaddl_u16(pred_u16, ref_u16);
+ const uint16x4_t vcomp_pred =
+ vshrn_n_u32(vaddq_u32(sum_u32, one_u32), 1);
+ vst1_u16(&comp_pred[j], vcomp_pred);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ }
+}
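
Both the vrhadd*_u16 calls in the SAD code and the widening add / round /
narrow sequence here compute the rounding average (a + b + 1) >> 1; a scalar
sketch:

    static uint16_t avg_round_u16(uint16_t a, uint16_t b) {
      // Widen before adding so a + b + 1 cannot overflow 16 bits.
      return (uint16_t)(((uint32_t)a + b + 1) >> 1);
    }
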
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
diff --git a/libvpx/vpx_dsp/arm/mem_neon.h b/libvpx/vpx_dsp/arm/mem_neon.h
index 50aaa94fe..19cfc7c7f 100644
--- a/libvpx/vpx_dsp/arm/mem_neon.h
+++ b/libvpx/vpx_dsp/arm/mem_neon.h
@@ -116,11 +116,11 @@ static INLINE void uint32_to_mem(uint8_t *buf, uint32_t a) {
static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x2_t a_u32 = vdup_n_u32(0);
+ uint32x2_t a_u32;
if (stride == 4) return vld1_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vset_lane_u32(a, a_u32, 0);
+ a_u32 = vdup_n_u32(a);
memcpy(&a, buf, 4);
a_u32 = vset_lane_u32(a, a_u32, 1);
return vreinterpret_u8_u32(a_u32);
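
Duplicating the first word initializes both lanes in one instruction, so the
separate zero vector and the lane-0 insert are no longer needed; the pattern
in isolation (a hypothetical sketch):

    static uint32x2_t pair_u32(uint32_t a, uint32_t b) {
      uint32x2_t v = vdup_n_u32(a);   // both lanes = a; no zeroed vector
      return vset_lane_u32(b, v, 1);  // overwrite the high lane with b
    }
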
@@ -143,11 +143,11 @@ static INLINE void store_unaligned_u8(uint8_t *buf, ptrdiff_t stride,
static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf,
ptrdiff_t stride) {
uint32_t a;
- uint32x4_t a_u32 = vdupq_n_u32(0);
+ uint32x4_t a_u32;
if (stride == 4) return vld1q_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vsetq_lane_u32(a, a_u32, 0);
+ a_u32 = vdupq_n_u32(a);
memcpy(&a, buf, 4);
buf += stride;
a_u32 = vsetq_lane_u32(a, a_u32, 1);
@@ -201,4 +201,161 @@ static INLINE void store_u8(uint8_t *buf, ptrdiff_t stride, const uint8x8_t a) {
buf += stride;
vst1_lane_u32((uint32_t *)buf, a_u32, 1);
}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x4(uint8_t *s, const ptrdiff_t p,
+ const uint8x16_t s0, const uint8x16_t s1,
+ const uint8x16_t s2, const uint8x16_t s3) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+}
+
+static INLINE void load_u8_8x7(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x8_t s0, const uint8x8_t s1,
+ const uint8x8_t s2, const uint8x8_t s3,
+ const uint8x8_t s4, const uint8x8_t s5,
+ const uint8x8_t s6, const uint8x8_t s7) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+ s += p;
+ vst1_u8(s, s4);
+ s += p;
+ vst1_u8(s, s5);
+ s += p;
+ vst1_u8(s, s6);
+ s += p;
+ vst1_u8(s, s7);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3,
+ uint8x16_t *const s4, uint8x16_t *const s5,
+ uint8x16_t *const s6, uint8x16_t *const s7) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+ s += p;
+ *s4 = vld1q_u8(s);
+ s += p;
+ *s5 = vld1q_u8(s);
+ s += p;
+ *s6 = vld1q_u8(s);
+ s += p;
+ *s7 = vld1q_u8(s);
+}
+
+static INLINE void store_u8_16x8(uint8_t *s, const ptrdiff_t p,
+ const uint8x16_t s0, const uint8x16_t s1,
+ const uint8x16_t s2, const uint8x16_t s3,
+ const uint8x16_t s4, const uint8x16_t s5,
+ const uint8x16_t s6, const uint8x16_t s7) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+ s += p;
+ vst1q_u8(s, s4);
+ s += p;
+ vst1q_u8(s, s5);
+ s += p;
+ vst1q_u8(s, s6);
+ s += p;
+ vst1q_u8(s, s7);
+}
+
#endif // VPX_VPX_DSP_ARM_MEM_NEON_H_
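
The new helpers factor the strided row loads and stores out of the kernels
that use them; a minimal (hypothetical) use:

    static void copy_block_8x4(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride) {
      uint8x8_t r0, r1, r2, r3;
      load_u8_8x4(src, src_stride, &r0, &r1, &r2, &r3);
      store_u8_8x4(dst, dst_stride, r0, r1, r2, r3);
    }
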
diff --git a/libvpx/vpx_dsp/arm/quantize_neon.c b/libvpx/vpx_dsp/arm/quantize_neon.c
index bd7818a07..9c227d560 100644
--- a/libvpx/vpx_dsp/arm/quantize_neon.c
+++ b/libvpx/vpx_dsp/arm/quantize_neon.c
@@ -17,20 +17,57 @@
static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
const int16x8_t dequant,
- tran_low_t *dqcoeff) {
+ tran_low_t *dqcoeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
const int32x4_t dqcoeff_0 =
vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
const int32x4_t dqcoeff_1 =
vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
-#if CONFIG_VP9_HIGHBITDEPTH
- vst1q_s32(dqcoeff, dqcoeff_0);
- vst1q_s32(dqcoeff + 4, dqcoeff_1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
#else
- vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
+ vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+static INLINE int16x8_t
+quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+ const int16x8_t round, const int16x8_t quant,
+ const int16x8_t quant_shift, const int16x8_t dequant) {
+ // Load coeffs as 8 x 16-bit ints, take sign and abs values
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate a mask of elements outside the zero-bin; only these are kept
+ const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ // Get the rounded values
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+ qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ // Only keep the relevant coeffs
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
+
+ return qcoeff;
+}
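
vqdmulhq_s16(a, b) returns the saturated value of (2 * a * b) >> 16, so the
shift right by one that follows each call cancels the doubling; a scalar
sketch (helper name hypothetical):

    static int16_t mulhi16_model(int16_t a, int16_t b) {
      // == vshrq_n_s16(vqdmulhq_s16(a, b), 1) per lane, absent saturation.
      return (int16_t)(((int32_t)a * b) >> 16);
    }
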
+
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
@@ -38,109 +75,59 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
uint16_t *eob_ptr, const int16_t *scan,
const int16_t *iscan) {
- const int16x8_t one = vdupq_n_s16(1);
const int16x8_t neg_one = vdupq_n_s16(-1);
uint16x8_t eob_max;
- (void)scan;
+
+ // Only the first element of each vector is DC.
+ int16x8_t zbin = vld1q_s16(zbin_ptr);
+ int16x8_t round = vld1q_s16(round_ptr);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
// Process first 8 values which include a dc component.
{
- // Only the first element of each vector is DC.
- const int16x8_t zbin = vld1q_s16(zbin_ptr);
- const int16x8_t round = vld1q_s16(round_ptr);
- const int16x8_t quant = vld1q_s16(quant_ptr);
- const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
- const int16x8_t dequant = vld1q_s16(dequant_ptr);
- // Add one because the eob does not index from 0.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
- qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
+ quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
n_coeffs -= 8;
{
- const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]);
- const int16x8_t round = vdupq_n_s16(round_ptr[1]);
- const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
- const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
- const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+ zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+ round = vdupq_lane_s16(vget_low_s16(round), 1);
+ quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+ quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+ dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
do {
- // Add one because the eob is not its index.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
- qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max =
vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
-
n_coeffs -= 8;
} while (n_coeffs > 0);
}
@@ -156,6 +143,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)scan;
}
static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -164,7 +154,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
const int16x8_t dequant,
- tran_low_t *dqcoeff) {
+ tran_low_t *dqcoeff_ptr) {
int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
int32x4_t dqcoeff_1 =
vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
@@ -176,14 +166,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
#if CONFIG_VP9_HIGHBITDEPTH
dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
- vst1q_s32(dqcoeff, dqcoeff_0);
- vst1q_s32(dqcoeff + 4, dqcoeff_1);
+ vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+ vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
#else
- vst1q_s16(dqcoeff,
+ vst1q_s16(dqcoeff_ptr,
vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
#endif // CONFIG_VP9_HIGHBITDEPTH
}
+static INLINE int16x8_t
+quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+ const int16x8_t round, const int16x8_t quant,
+ const int16x8_t quant_shift, const int16x8_t dequant) {
+ // Load coeffs as 8 x 16-bit ints, take sign and abs values
+ const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+ const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+ const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate a mask of elements outside the zero-bin; only these are kept
+ const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+ // Get the rounded values
+ const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+ // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+ int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+ qcoeff = vaddq_s16(qcoeff, rounded);
+
+ // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
+ qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+ // Restore the sign bit.
+ qcoeff = veorq_s16(qcoeff, coeff_sign);
+ qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+ // Only keep the relevant coeffs
+ qcoeff = vandq_s16(qcoeff, zbin_mask);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
+
+ return qcoeff;
+}
+
// Main difference is that zbin values are halved before comparison and dqcoeff
// values are divided by 2. zbin is rounded but dqcoeff is not.
void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -194,107 +221,57 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const int16x8_t one = vdupq_n_s16(1);
const int16x8_t neg_one = vdupq_n_s16(-1);
uint16x8_t eob_max;
int i;
- (void)scan;
- (void)n_coeffs; // Because we will always calculate 32*32.
+
+ // Only the first element of each vector is DC.
+ int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
+ int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+ int16x8_t quant = vld1q_s16(quant_ptr);
+ int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+ int16x8_t dequant = vld1q_s16(dequant_ptr);
// Process first 8 values which include a dc component.
{
- // Only the first element of each vector is DC.
- const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
- const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
- const int16x8_t quant = vld1q_s16(quant_ptr);
- const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
- const int16x8_t dequant = vld1q_s16(dequant_ptr);
- // Add one because the eob does not index from 0.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
- qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
{
- const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
- const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
- const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
- const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
- const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
+ zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+ round = vdupq_lane_s16(vget_low_s16(round), 1);
+ quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+ quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+ dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
for (i = 1; i < 32 * 32 / 8; ++i) {
- // Add one because the eob is not its index.
- const uint16x8_t v_iscan =
- vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
-
- const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
- const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
- const int16x8_t coeff_abs = vabsq_s16(coeff);
-
- const int16x8_t zbin_mask =
- vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+ const uint16x8_t v_iscan = vreinterpretq_u16_s16(vld1q_s16(iscan));
- const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
- // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
- int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
- qcoeff = vaddq_s16(qcoeff, rounded);
-
- // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
- qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
- // Restore the sign bit.
- qcoeff = veorq_s16(qcoeff, coeff_sign);
- qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
- qcoeff = vandq_s16(qcoeff, zbin_mask);
+ const int16x8_t qcoeff =
+ quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+ quant, quant_shift, dequant);
// Set non-zero elements to -1 and use that to extract values for eob.
eob_max =
vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
+ __builtin_prefetch(coeff_ptr + 64);
coeff_ptr += 8;
iscan += 8;
-
- store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
qcoeff_ptr += 8;
-
- calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
dqcoeff_ptr += 8;
}
}
@@ -310,4 +287,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
#endif // __aarch64__
+ // Need these here, else the compiler complains about mixing declarations and
+ // code in C90
+ (void)n_coeffs;
+ (void)scan;
}
diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c
index 03f716c3d..5fc621aee 100644
--- a/libvpx/vpx_dsp/arm/sad4d_neon.c
+++ b/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -20,9 +20,9 @@
static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
const void *const buf1) {
uint32_t a;
- uint32x2_t aa = vdup_n_u32(0);
+ uint32x2_t aa;
memcpy(&a, buf0, 4);
- aa = vset_lane_u32(a, aa, 0);
+ aa = vdup_n_u32(a);
memcpy(&a, buf1, 4);
aa = vset_lane_u32(a, aa, 1);
return vreinterpret_u8_u32(aa);
@@ -237,8 +237,7 @@ void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint32x4_t *const sum) {
@@ -270,7 +269,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
uint16x8_t *const sum) {
@@ -305,7 +304,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
sad_512_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -327,8 +326,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -386,7 +384,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -444,12 +442,11 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad_2048_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
////////////////////////////////////////////////////////////////////////////////
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -554,7 +551,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
-#else
+#else // !(defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD))
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
@@ -649,4 +646,4 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
sad_4096_pel_final_neon(sum, sad_array);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
diff --git a/libvpx/vpx_dsp/arm/sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c
index b1509d883..ad575d4aa 100644
--- a/libvpx/vpx_dsp/arm/sad_neon.c
+++ b/libvpx/vpx_dsp/arm/sad_neon.c
@@ -21,9 +21,15 @@ uint32_t vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
const uint8x16_t src_u8 = load_unaligned_u8q(src_ptr, src_stride);
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+#if defined(__ARM_FEATURE_DOTPROD)
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8);
+ const uint32x4_t dp = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
+ return horizontal_add_uint32x4(dp);
+#else
uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
return horizontal_add_uint16x8(abs);
+#endif
}
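
On cores with the dot-product extension, these kernels feed absolute
differences into udot against an all-ones vector, reducing each group of four
bytes straight into a 32-bit lane with no widening steps; a scalar model
(a sketch, name hypothetical):

    static uint32_t sad16_bytes_model(const uint8_t *src, const uint8_t *ref) {
      // Models vdotq_u32(vdupq_n_u32(0), vabdq_u8(s, r), vdupq_n_u8(1))
      // followed by a horizontal add across the four lanes.
      uint32_t acc = 0;
      int i;
      for (i = 0; i < 16; ++i)
        acc += src[i] > ref[i] ? (uint32_t)(src[i] - ref[i])
                               : (uint32_t)(ref[i] - src[i]);
      return acc;
    }
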
uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
@@ -33,13 +39,34 @@ uint32_t vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride,
const uint8x16_t ref_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
const uint8x16_t second_pred_u8 = vld1q_u8(second_pred);
const uint8x16_t avg = vrhaddq_u8(ref_u8, second_pred_u8);
+#if defined(__ARM_FEATURE_DOTPROD)
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, avg);
+ const uint32x4_t prod = vdotq_u32(vdupq_n_u32(0), sad_u8, vdupq_n_u8(1));
+ return horizontal_add_uint32x4(prod);
+#else
uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(avg));
abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(avg));
return horizontal_add_uint16x8(abs);
+#endif
}
uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
+#if defined(__ARM_FEATURE_DOTPROD)
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ const uint8x16_t src2_u8 =
+ load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride);
+ const uint8x16_t ref2_u8 =
+ load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride);
+ const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, ref1_u8);
+ const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, ref2_u8);
+ prod = vdotq_u32(prod, sad1_u8, ones);
+ prod = vdotq_u32(prod, sad2_u8, ones);
+ return horizontal_add_uint32x4(prod);
+#else
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < 8; i += 4) {
@@ -52,11 +79,31 @@ uint32_t vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride,
}
return horizontal_add_uint16x8(abs);
+#endif
}
uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const uint8_t *second_pred) {
+#if defined(__ARM_FEATURE_DOTPROD)
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ const uint8x16_t src1_u8 = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t ref1_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
+ const uint8x16_t src2_u8 =
+ load_unaligned_u8q(src_ptr + 4 * src_stride, src_stride);
+ const uint8x16_t ref2_u8 =
+ load_unaligned_u8q(ref_ptr + 4 * ref_stride, ref_stride);
+ const uint8x16_t second_pred1_u8 = vld1q_u8(second_pred);
+ const uint8x16_t second_pred2_u8 = vld1q_u8(second_pred + 16);
+ const uint8x16_t avg1 = vrhaddq_u8(ref1_u8, second_pred1_u8);
+ const uint8x16_t avg2 = vrhaddq_u8(ref2_u8, second_pred2_u8);
+ const uint8x16_t sad1_u8 = vabdq_u8(src1_u8, avg1);
+ const uint8x16_t sad2_u8 = vabdq_u8(src2_u8, avg2);
+ prod = vdotq_u32(prod, sad1_u8, ones);
+ prod = vdotq_u32(prod, sad2_u8, ones);
+ return horizontal_add_uint32x4(prod);
+#else
int i;
uint16x8_t abs = vdupq_n_u16(0);
for (i = 0; i < 8; i += 4) {
@@ -72,8 +119,65 @@ uint32_t vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride,
}
return horizontal_add_uint16x8(abs);
+#endif
}
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x2_t sad8x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x2_t prod = vdup_n_u32(0);
+ const uint8x8_t ones = vdup_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(src_ptr);
+ const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+ const uint8x8_t sad_u8 = vabd_u8(a_u8, b_u8);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdot_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x2_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x2_t prod = vdup_n_u32(0);
+ const uint8x8_t ones = vdup_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t a_u8 = vld1_u8(src_ptr);
+ const uint8x8_t b_u8 = vld1_u8(ref_ptr);
+ const uint8x8_t c_u8 = vld1_u8(second_pred);
+ const uint8x8_t avg = vrhadd_u8(b_u8, c_u8);
+ const uint8x8_t sad_u8 = vabd_u8(a_u8, avg);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ prod = vdot_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD8XN(n) \
+ uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x2_t prod = \
+ sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x2(prod); \
+ } \
+ \
+ uint32_t vpx_sad8x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x2_t prod = \
+ sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x2(prod); \
+ }
+
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad8x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -124,11 +228,67 @@ static INLINE uint16x8_t sad8x_avg(const uint8_t *src_ptr, int src_stride,
sad8x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD8XN(4)
SAD8XN(8)
SAD8XN(16)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad16x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t src_u8 = vld1q_u8(src_ptr);
+ const uint8x16_t ref_u8 = vld1q_u8(ref_ptr);
+ const uint8x16_t sad_u8 = vabdq_u8(src_u8, ref_u8);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_u8 = vld1q_u8(src_ptr);
+ const uint8x16_t b_u8 = vld1q_u8(ref_ptr);
+ const uint8x16_t c_u8 = vld1q_u8(second_pred);
+ const uint8x16_t avg = vrhaddq_u8(b_u8, c_u8);
+ const uint8x16_t sad_u8 = vabdq_u8(a_u8, avg);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ prod = vdotq_u32(prod, sad_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD16XN(n) \
+ uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x4_t prod = \
+ sad16x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x4(prod); \
+ } \
+ \
+ uint32_t vpx_sad16x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x4_t prod = \
+ sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x4(prod); \
+ }
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -182,11 +342,78 @@ static INLINE uint16x8_t sad16x_avg(const uint8_t *src_ptr, int src_stride,
sad16x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD16XN(8)
SAD16XN(16)
SAD16XN(32)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad32x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_lo = vld1q_u8(src_ptr);
+ const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+ const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+ const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, b_lo);
+ const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, b_hi);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_lo_u8, ones);
+ prod = vdotq_u32(prod, sad_hi_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_lo = vld1q_u8(src_ptr);
+ const uint8x16_t a_hi = vld1q_u8(src_ptr + 16);
+ const uint8x16_t b_lo = vld1q_u8(ref_ptr);
+ const uint8x16_t b_hi = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t c_lo = vld1q_u8(second_pred);
+ const uint8x16_t c_hi = vld1q_u8(second_pred + 16);
+ const uint8x16_t avg_lo = vrhaddq_u8(b_lo, c_lo);
+ const uint8x16_t avg_hi = vrhaddq_u8(b_hi, c_hi);
+ const uint8x16_t sad_lo_u8 = vabdq_u8(a_lo, avg_lo);
+ const uint8x16_t sad_hi_u8 = vabdq_u8(a_hi, avg_hi);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ prod = vdotq_u32(prod, sad_lo_u8, ones);
+ prod = vdotq_u32(prod, sad_hi_u8, ones);
+ }
+ return prod;
+}
+
+#define SAD32XN(n) \
+ uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ const uint32x4_t prod = \
+ sad32x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
+ return horizontal_add_uint32x4(prod); \
+ } \
+ \
+ uint32_t vpx_sad32x##n##_avg_neon(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride, \
+ const uint8_t *second_pred) { \
+ const uint32x4_t prod = \
+ sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
+ return horizontal_add_uint32x4(prod); \
+ }
+
+#else  // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -250,11 +477,81 @@ static INLINE uint16x8_t sad32x_avg(const uint8_t *src_ptr, int src_stride,
sad32x_avg(src_ptr, src_stride, ref_ptr, ref_stride, second_pred, n); \
return horizontal_add_uint16x8(abs); \
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
SAD32XN(16)
SAD32XN(32)
SAD32XN(64)
+#if defined(__ARM_FEATURE_DOTPROD)
+static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(src_ptr);
+ const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+ const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+ const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+ const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+ const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+ const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+ const uint8x16_t sad_0_u8 = vabdq_u8(a_0, b_0);
+ const uint8x16_t sad_1_u8 = vabdq_u8(a_1, b_1);
+ const uint8x16_t sad_2_u8 = vabdq_u8(a_2, b_2);
+ const uint8x16_t sad_3_u8 = vabdq_u8(a_3, b_3);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ prod = vdotq_u32(prod, sad_0_u8, ones);
+ prod = vdotq_u32(prod, sad_1_u8, ones);
+ prod = vdotq_u32(prod, sad_2_u8, ones);
+ prod = vdotq_u32(prod, sad_3_u8, ones);
+ }
+ return prod;
+}
+
+static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred,
+ const int height) {
+ int i;
+ uint32x4_t prod = vdupq_n_u32(0);
+ const uint8x16_t ones = vdupq_n_u8(1);
+ for (i = 0; i < height; ++i) {
+ const uint8x16_t a_0 = vld1q_u8(src_ptr);
+ const uint8x16_t a_1 = vld1q_u8(src_ptr + 16);
+ const uint8x16_t a_2 = vld1q_u8(src_ptr + 32);
+ const uint8x16_t a_3 = vld1q_u8(src_ptr + 48);
+ const uint8x16_t b_0 = vld1q_u8(ref_ptr);
+ const uint8x16_t b_1 = vld1q_u8(ref_ptr + 16);
+ const uint8x16_t b_2 = vld1q_u8(ref_ptr + 32);
+ const uint8x16_t b_3 = vld1q_u8(ref_ptr + 48);
+ const uint8x16_t c_0 = vld1q_u8(second_pred);
+ const uint8x16_t c_1 = vld1q_u8(second_pred + 16);
+ const uint8x16_t c_2 = vld1q_u8(second_pred + 32);
+ const uint8x16_t c_3 = vld1q_u8(second_pred + 48);
+ const uint8x16_t avg_0 = vrhaddq_u8(b_0, c_0);
+ const uint8x16_t avg_1 = vrhaddq_u8(b_1, c_1);
+ const uint8x16_t avg_2 = vrhaddq_u8(b_2, c_2);
+ const uint8x16_t avg_3 = vrhaddq_u8(b_3, c_3);
+ const uint8x16_t sad_0_u8 = vabdq_u8(a_0, avg_0);
+ const uint8x16_t sad_1_u8 = vabdq_u8(a_1, avg_1);
+ const uint8x16_t sad_2_u8 = vabdq_u8(a_2, avg_2);
+ const uint8x16_t sad_3_u8 = vabdq_u8(a_3, avg_3);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ prod = vdotq_u32(prod, sad_0_u8, ones);
+ prod = vdotq_u32(prod, sad_1_u8, ones);
+ prod = vdotq_u32(prod, sad_2_u8, ones);
+ prod = vdotq_u32(prod, sad_3_u8, ones);
+ }
+ return prod;
+}
+#else // !defined(__ARM_FEATURE_DOTPROD)
static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
const int height) {
@@ -332,6 +629,7 @@ static INLINE uint32x4_t sad64x_avg(const uint8_t *src_ptr, int src_stride,
return vpadalq_u16(sum, abs_1);
}
}
+#endif // defined(__ARM_FEATURE_DOTPROD)
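
The dot-product SAD kernels added above all use the same reduction trick: the sum
of absolute differences of a row equals the dot product of the per-byte
|src - ref| values with a vector of all ones, so each vdotq_u32() folds 16 bytes
into four 32-bit lanes with no intermediate widening. A minimal scalar reference
of what every sadWxH helper computes (hypothetical standalone code, not part of
the patch):

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference: sum of absolute differences over a width x height
 * block. The NEON kernels express the inner loop as
 * vdotq_u32(acc, vabdq_u8(src, ref), ones). */
static uint32_t sad_ref(const uint8_t *src, int src_stride,
                        const uint8_t *ref, int ref_stride, int width,
                        int height) {
  uint32_t sad = 0;
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) sad += (uint32_t)abs(src[j] - ref[j]);
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}
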
#define SAD64XN(n) \
uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride, \
diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
index a3befdc34..9328c3ed8 100644
--- a/libvpx/vpx_dsp/arm/subpel_variance_neon.c
+++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -17,168 +17,474 @@
#include "vpx_dsp/variance.h"
#include "vpx_dsp/arm/mem_neon.h"
-static const uint8_t bilinear_filters[8][2] = {
- { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
- { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
-};
-
// Process a block exactly 4 wide and a multiple of 2 high.
-static void var_filter_block2d_bil_w4(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; i += 2) {
- const uint8x8_t src_0 = load_unaligned_u8(src_ptr, src_pixels_per_line);
- const uint8x8_t src_1 =
- load_unaligned_u8(src_ptr + pixel_step, src_pixels_per_line);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(output_ptr, out);
- src_ptr += 2 * src_pixels_per_line;
- output_ptr += 8;
- }
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ i -= 2;
+ } while (i != 0);
}
// Process a block exactly 8 wide and any height.
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i;
- for (i = 0; i < output_height; ++i) {
- const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
- const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
- const uint16x8_t a = vmull_u8(src_0, f0);
- const uint16x8_t b = vmlal_u8(a, src_1, f1);
- const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
- vst1_u8(output_ptr, out);
- src_ptr += src_pixels_per_line;
- output_ptr += 8;
- }
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ } while (--i != 0);
}
// Process a block which is a multiple of 16 wide and any height.
-static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const uint8_t *filter) {
- const uint8x8_t f0 = vdup_n_u8(filter[0]);
- const uint8x8_t f1 = vdup_n_u8(filter[1]);
- unsigned int i, j;
- for (i = 0; i < output_height; ++i) {
- for (j = 0; j < output_width; j += 16) {
- const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
- const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
- const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
- const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
- const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
- const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
- const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
- const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
- vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
- }
- src_ptr += src_pixels_per_line;
- output_ptr += output_width;
- }
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l =
+ vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+ uint16x8_t blend_h =
+ vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+ uint8x8_t out_lo = vrshrn_n_u16(blend_l, 3);
+ uint8x8_t out_hi = vrshrn_n_u16(blend_h, 3);
+ vst1q_u8(dst_ptr + j, vcombine_u8(out_lo, out_hi));
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+ dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+ dst_height, filter_offset);
+}
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
}
-// 4xM filter writes an extra row to fdata because it processes two rows at a
-// time.
-#define SUB_PIXEL_VARIANCENXM(n, m) \
- uint32_t vpx_sub_pixel_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) { \
- uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
- uint8_t temp1[n * m]; \
- \
- if (n == 4) { \
- var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else if (n == 8) { \
- var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else { \
- var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
- bilinear_filters[y_offset]); \
- } \
- return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse); \
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
}
-SUB_PIXEL_VARIANCENXM(4, 4)
-SUB_PIXEL_VARIANCENXM(4, 8)
-SUB_PIXEL_VARIANCENXM(8, 4)
-SUB_PIXEL_VARIANCENXM(8, 8)
-SUB_PIXEL_VARIANCENXM(8, 16)
-SUB_PIXEL_VARIANCENXM(16, 8)
-SUB_PIXEL_VARIANCENXM(16, 16)
-SUB_PIXEL_VARIANCENXM(16, 32)
-SUB_PIXEL_VARIANCENXM(32, 16)
-SUB_PIXEL_VARIANCENXM(32, 32)
-SUB_PIXEL_VARIANCENXM(32, 64)
-SUB_PIXEL_VARIANCENXM(64, 32)
-SUB_PIXEL_VARIANCENXM(64, 64)
-
-// 4xM filter writes an extra row to fdata because it processes two rows at a
-// time.
-#define SUB_PIXEL_AVG_VARIANCENXM(n, m) \
- uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, \
- const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return vpx_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
+ sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+
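
The SPECIALIZED_* variants exploit the 3-bit subpel offset range [0, 7]:
offset 0 means taps (8, 0), i.e. no filtering at all, and offset 4 means taps
(4, 4), i.e. an unweighted rounded average that vrhaddq_u8 computes in a single
instruction, so only the remaining offsets fall through to the general two-tap
filter. A sketch of the dispatch (illustrative only, not part of the patch):

/* Classify a subpel offset the way the specialized macros above do. */
static int subpel_filter_kind(int offset) {
  if (offset == 0) return 0; /* copy: taps (8, 0) */
  if (offset == 4) return 1; /* half-pel: taps (4, 4), plain rounded average */
  return 2;                  /* general two-tap bilinear filter */
}
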
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmlal_u8(vmull_u8(s0, f0), s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+  } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l =
+ vmlal_u8(vmull_u8(vget_low_u8(s0), f0), vget_low_u8(s1), f1);
+ uint16x8_t blend_h =
+ vmlal_u8(vmull_u8(vget_high_u8(s0), f0), vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with vpx_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with vpx_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height,
+ const uint8_t *second_pred) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ avg = vrhaddq_u8(avg, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of vpx_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+ int dst_width, int dst_height,
+ const uint8_t *second_pred) {
+ int i = dst_height;
+
+  // We only specialize on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(s, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
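
All of the avg_pred* helpers rely on vrhadd(q)_u8 computing the rounding
halving add (a + b + 1) >> 1 per byte, which is exactly the vpx_comp_avg_pred
averaging rule. A scalar equivalent of the avg_pred() function above
(hypothetical reference, not part of the patch):

#include <stdint.h>

static void comp_avg_ref(const uint8_t *src, uint8_t *dst, int src_stride,
                         int width, int height, const uint8_t *second_pred) {
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; ++j) {
      dst[j] = (uint8_t)((src[j] + second_pred[j] + 1) >> 1);
    }
    src += src_stride;
    dst += width; /* the temporary output buffer is packed at 'width' */
    second_pred += width;
  }
}
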
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
const uint8_t *second_pred) { \
- uint8_t temp0[n * (m + (n == 4 ? 2 : 1))]; \
- uint8_t temp1[n * m]; \
- \
- if (n == 4) { \
- var_filter_block2d_bil_w4(src_ptr, temp0, src_stride, 1, (m + 2), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w4(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else if (n == 8) { \
- var_filter_block2d_bil_w8(src_ptr, temp0, src_stride, 1, (m + 1), \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w8(temp0, temp1, n, n, m, \
- bilinear_filters[y_offset]); \
- } else { \
- var_filter_block2d_bil_w16(src_ptr, temp0, src_stride, 1, (m + 1), n, \
- bilinear_filters[x_offset]); \
- var_filter_block2d_bil_w16(temp0, temp1, n, n, m, n, \
- bilinear_filters[y_offset]); \
- } \
- \
- vpx_comp_avg_pred(temp0, second_pred, n, m, temp1, n); \
- \
- return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse); \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int vpx_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ avg_pred(src, tmp, source_stride, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+ xoffset, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return vpx_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
}
-SUB_PIXEL_AVG_VARIANCENXM(4, 4)
-SUB_PIXEL_AVG_VARIANCENXM(4, 8)
-SUB_PIXEL_AVG_VARIANCENXM(8, 4)
-SUB_PIXEL_AVG_VARIANCENXM(8, 8)
-SUB_PIXEL_AVG_VARIANCENXM(8, 16)
-SUB_PIXEL_AVG_VARIANCENXM(16, 8)
-SUB_PIXEL_AVG_VARIANCENXM(16, 16)
-SUB_PIXEL_AVG_VARIANCENXM(16, 32)
-SUB_PIXEL_AVG_VARIANCENXM(32, 16)
-SUB_PIXEL_AVG_VARIANCENXM(32, 32)
-SUB_PIXEL_AVG_VARIANCENXM(32, 64)
-SUB_PIXEL_AVG_VARIANCENXM(64, 32)
-SUB_PIXEL_AVG_VARIANCENXM(64, 64)
+// 4x<h> blocks are processed two rows at a time, so require an extra row of
+// padding.
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
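
The tmp0 buffers in both macro families are sized w * (h + padding) because the
first (horizontal) pass must produce every row the second (vertical) pass
reads: the vertical tap looks pixel_step == w elements ahead, i.e. one row, and
the width-4 kernels emit rows in pairs and so need two spare rows. A
hypothetical helper making that arithmetic explicit (not part of the patch):

static int first_pass_rows(int h, int block_width) {
  /* padding is 2 for 4-wide blocks (rows are produced two at a time),
   * otherwise 1 (the vertical filter reads one row beyond the last). */
  const int padding = (block_width == 4) ? 2 : 1;
  return h + padding;
}
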
diff --git a/libvpx/vpx_dsp/arm/subtract_neon.c b/libvpx/vpx_dsp/arm/subtract_neon.c
index 612897e24..2c008e48a 100644
--- a/libvpx/vpx_dsp/arm/subtract_neon.c
+++ b/libvpx/vpx_dsp/arm/subtract_neon.c
@@ -79,3 +79,59 @@ void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
} while (r);
}
}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_neon(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ int r = rows, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+
+ if (cols >= 16) {
+ do {
+ for (c = 0; c < cols; c += 16) {
+ const uint16x8_t s0 = vld1q_u16(&src[c + 0]);
+ const uint16x8_t s1 = vld1q_u16(&src[c + 8]);
+ const uint16x8_t p0 = vld1q_u16(&pred[c + 0]);
+ const uint16x8_t p1 = vld1q_u16(&pred[c + 8]);
+ const uint16x8_t d0 = vsubq_u16(s0, p0);
+ const uint16x8_t d1 = vsubq_u16(s1, p1);
+ vst1q_s16(&diff_ptr[c + 0], vreinterpretq_s16_u16(d0));
+ vst1q_s16(&diff_ptr[c + 8], vreinterpretq_s16_u16(d1));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols >= 8) {
+ do {
+ for (c = 0; c < cols; c += 8) {
+ const uint16x8_t s = vld1q_u16(&src[c]);
+ const uint16x8_t p = vld1q_u16(&pred[c]);
+ const uint16x8_t d0 = vsubq_u16(s, p);
+ vst1q_s16(&diff_ptr[c], vreinterpretq_s16_u16(d0));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ } else if (cols >= 4) {
+ do {
+ for (c = 0; c < cols; c += 4) {
+ const uint16x4_t s = vld1_u16(&src[c]);
+ const uint16x4_t p = vld1_u16(&pred[c]);
+ const uint16x4_t v_diff = vsub_u16(s, p);
+ vst1_s16(&diff_ptr[c], vreinterpret_s16_u16(v_diff));
+ }
+ diff_ptr += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
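
In the high-bitdepth path the samples are uint16_t (10- or 12-bit content), so
src - pred always fits in int16_t and the kernels simply reinterpret the
unsigned subtraction result; the bd argument is unused for that reason. A
scalar sketch (hypothetical reference, not part of the patch):

#include <stddef.h>
#include <stdint.h>

static void highbd_subtract_ref(int rows, int cols, int16_t *diff,
                                ptrdiff_t diff_stride, const uint16_t *src,
                                ptrdiff_t src_stride, const uint16_t *pred,
                                ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; ++r) {
    for (int c = 0; c < cols; ++c) diff[c] = (int16_t)(src[c] - pred[c]);
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}
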
diff --git a/libvpx/vpx_dsp/arm/transpose_neon.h b/libvpx/vpx_dsp/arm/transpose_neon.h
index c098ad31b..41d44f2b1 100644
--- a/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -568,6 +568,40 @@ static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
*a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
}
+// Transpose 8x8 to a new location.
+static INLINE void transpose_s16_8x8_new(const int16x8_t *a, int16x8_t *b) {
+ // Swap 16 bit elements.
+ const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements.
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+ const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+ vreinterpretq_s32_s16(c3.val[0]));
+ const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+ vreinterpretq_s32_s16(c3.val[1]));
+
+ // Swap 64 bit elements
+ const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+ const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+ const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+ const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+ b[0] = e0.val[0];
+ b[1] = e1.val[0];
+ b[2] = e2.val[0];
+ b[3] = e3.val[0];
+ b[4] = e0.val[1];
+ b[5] = e1.val[1];
+ b[6] = e2.val[1];
+ b[7] = e3.val[1];
+}
+
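
transpose_s16_8x8_new() is the standard staged NEON transpose: vtrnq_s16 swaps
16-bit elements between adjacent rows, vtrnq_s32 swaps 32-bit pairs between
rows two apart, and the 64-bit stage swaps halves between rows four apart;
after log2(8) = 3 stages every element sits at its transposed position. The
net effect is simply the following (scalar model, not part of the patch):

#include <stdint.h>

/* Scalar model of what the staged NEON transpose computes. */
static void transpose_8x8_ref(const int16_t in[8][8], int16_t out[8][8]) {
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) out[j][i] = in[i][j];
  }
}
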
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
int16x8_t *a2, int16x8_t *a3,
int16x8_t *a4, int16x8_t *a5,
@@ -787,6 +821,51 @@ static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
a7->val[1] = c7.val[1];
}
+// Helper transpose function for highbd FDCT variants
+static INLINE void transpose_s32_8x8_2(int32x4_t *left /*[8]*/,
+ int32x4_t *right /*[8]*/,
+ int32x4_t *out_left /*[8]*/,
+ int32x4_t *out_right /*[8]*/) {
+ int32x4x2_t out[8];
+
+ out[0].val[0] = left[0];
+ out[0].val[1] = right[0];
+ out[1].val[0] = left[1];
+ out[1].val[1] = right[1];
+ out[2].val[0] = left[2];
+ out[2].val[1] = right[2];
+ out[3].val[0] = left[3];
+ out[3].val[1] = right[3];
+ out[4].val[0] = left[4];
+ out[4].val[1] = right[4];
+ out[5].val[0] = left[5];
+ out[5].val[1] = right[5];
+ out[6].val[0] = left[6];
+ out[6].val[1] = right[6];
+ out[7].val[0] = left[7];
+ out[7].val[1] = right[7];
+
+ transpose_s32_8x8(&out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+
+ out_left[0] = out[0].val[0];
+ out_left[1] = out[1].val[0];
+ out_left[2] = out[2].val[0];
+ out_left[3] = out[3].val[0];
+ out_left[4] = out[4].val[0];
+ out_left[5] = out[5].val[0];
+ out_left[6] = out[6].val[0];
+ out_left[7] = out[7].val[0];
+ out_right[0] = out[0].val[1];
+ out_right[1] = out[1].val[1];
+ out_right[2] = out[2].val[1];
+ out_right[3] = out[3].val[1];
+ out_right[4] = out[4].val[1];
+ out_right[5] = out[5].val[1];
+ out_right[6] = out[6].val[1];
+ out_right[7] = out[7].val[1];
+}
+
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c
index 7b93f142b..3ccc4e807 100644
--- a/libvpx/vpx_dsp/arm/variance_neon.c
+++ b/libvpx/vpx_dsp/arm/variance_neon.c
@@ -19,345 +19,357 @@
#include "vpx_dsp/arm/sum_neon.h"
#include "vpx_ports/mem.h"
-#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__ARM_FEATURE_DOTPROD)
// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i;
- uint32x4_t sum_a = vdupq_n_u32(0);
- uint32x4_t sum_b = vdupq_n_u32(0);
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
uint32x4_t sse_u32 = vdupq_n_u32(0);
- for (i = 0; i < h; i += 4) {
- const uint8x16_t a = load_unaligned_u8q(src_ptr, src_stride);
- const uint8x16_t b = load_unaligned_u8q(ref_ptr, ref_stride);
+ int i = h;
+ do {
+ const uint8x16_t s = load_unaligned_u8q(src_ptr, src_stride);
+ const uint8x16_t r = load_unaligned_u8q(ref_ptr, ref_stride);
- const uint8x16_t abs_diff = vabdq_u8(a, b);
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
- sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1));
- sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1));
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
src_ptr += 4 * src_stride;
ref_ptr += 4 * ref_stride;
- }
+ i -= 4;
+ } while (i != 0);
- *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b)));
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
*sse = horizontal_add_uint32x4(sse_u32);
}
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int w,
- int h, uint32_t *sse, int *sum) {
- int i, j;
- uint32x4_t sum_a = vdupq_n_u32(0);
- uint32x4_t sum_b = vdupq_n_u32(0);
+// Process a block of width 8 two rows at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
uint32x4_t sse_u32 = vdupq_n_u32(0);
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const uint8x16_t a = vld1q_u8(src_ptr + j);
- const uint8x16_t b = vld1q_u8(ref_ptr + j);
+ int i = h;
+ do {
+ const uint8x16_t s =
+ vcombine_u8(vld1_u8(src_ptr), vld1_u8(src_ptr + src_stride));
+ const uint8x16_t r =
+ vcombine_u8(vld1_u8(ref_ptr), vld1_u8(ref_ptr + ref_stride));
- const uint8x16_t abs_diff = vabdq_u8(a, b);
- sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
+}
+
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
- sum_a = vdotq_u32(sum_a, a, vdupq_n_u8(1));
- sum_b = vdotq_u32(sum_b, b, vdupq_n_u8(1));
- }
src_ptr += src_stride;
ref_ptr += ref_stride;
- }
+ } while (--i != 0);
- *sum = horizontal_add_int32x4(vreinterpretq_s32_u32(vsubq_u32(sum_a, sum_b)));
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
*sse = horizontal_add_uint32x4(sse_u32);
}
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i = 0;
- uint32x2_t sum_a = vdup_n_u32(0);
- uint32x2_t sum_b = vdup_n_u32(0);
- uint32x2_t sse_lo_u32 = vdup_n_u32(0);
- uint32x2_t sse_hi_u32 = vdup_n_u32(0);
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int i = h;
do {
- const uint8x8_t a_0 = vld1_u8(src_ptr);
- const uint8x8_t a_1 = vld1_u8(src_ptr + src_stride);
- const uint8x8_t b_0 = vld1_u8(ref_ptr);
- const uint8x8_t b_1 = vld1_u8(ref_ptr + ref_stride);
-
- const uint8x8_t abs_diff_0 = vabd_u8(a_0, b_0);
- const uint8x8_t abs_diff_1 = vabd_u8(a_1, b_1);
- sse_lo_u32 = vdot_u32(sse_lo_u32, abs_diff_0, abs_diff_0);
- sse_hi_u32 = vdot_u32(sse_hi_u32, abs_diff_1, abs_diff_1);
-
- sum_a = vdot_u32(sum_a, a_0, vdup_n_u8(1));
- sum_b = vdot_u32(sum_b, b_0, vdup_n_u8(1));
- sum_a = vdot_u32(sum_a, a_1, vdup_n_u8(1));
- sum_b = vdot_u32(sum_b, b_1, vdup_n_u8(1));
-
- src_ptr += src_stride + src_stride;
- ref_ptr += ref_stride + ref_stride;
- i += 2;
- } while (i < h);
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ j += 16;
+ } while (j < w);
- *sum = horizontal_add_int32x2(vreinterpret_s32_u32(vsub_u32(sum_a, sum_b)));
- *sse = horizontal_add_uint32x2(vadd_u32(sse_lo_u32, sse_hi_u32));
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int32x4(
+ vreinterpretq_s32_u32(vsubq_u32(src_sum, ref_sum)));
+ *sse = horizontal_add_uint32x4(sse_u32);
}
-#else
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
-// The variance helper functions use int16_t for sum. 8 values are accumulated
-// and then added (at which point they expand up to int32_t). To avoid overflow,
-// there can be no more than 32767 / 255 ~= 128 values accumulated in each
-// column. For a 32x32 buffer, this results in 32 / 8 = 4 values per row * 32
-// rows = 128. Asserts have been added to each function to warn against reaching
-// this limit.
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
-// Process a block of width 4 four rows at a time.
-static void variance_neon_w4x4(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i;
+#else // !defined(__ARM_FEATURE_DOTPROD)
+
+// Process a block of width 4 two rows at a time.
+static INLINE void variance_4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ int i = h;
- // Since width is only 4, sum_s16 only loads a half row per loop.
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
assert(h <= 256);
- for (i = 0; i < h; i += 4) {
- const uint8x16_t a_u8 = load_unaligned_u8q(src_ptr, src_stride);
- const uint8x16_t b_u8 = load_unaligned_u8q(ref_ptr, ref_stride);
- const uint16x8_t diff_lo_u16 =
- vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
- const uint16x8_t diff_hi_u16 =
- vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
-
- const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
- const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
+ do {
+ const uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ const uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
+ sum_s16 = vaddq_s16(sum_s16, diff);
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
- src_ptr += 4 * src_stride;
- ref_ptr += 4 * ref_stride;
- }
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
*sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sse = (uint32_t)horizontal_add_int32x4(sse_s32);
}
-// Process a block of any size where the width is divisible by 16.
-static void variance_neon_w16(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int w,
- int h, uint32_t *sse, int *sum) {
- int i, j;
+// Process a block of width 8 one row at a time.
+static INLINE void variance_8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
-
- // The loop loads 16 values at a time but doubles them up when accumulating
- // into sum_s16.
- assert(w / 8 * h <= 128);
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 16) {
- const uint8x16_t a_u8 = vld1q_u8(src_ptr + j);
- const uint8x16_t b_u8 = vld1q_u8(ref_ptr + j);
-
- const uint16x8_t diff_lo_u16 =
- vsubl_u8(vget_low_u8(a_u8), vget_low_u8(b_u8));
- const uint16x8_t diff_hi_u16 =
- vsubl_u8(vget_high_u8(a_u8), vget_high_u8(b_u8));
-
- const int16x8_t diff_lo_s16 = vreinterpretq_s16_u16(diff_lo_u16);
- const int16x8_t diff_hi_s16 = vreinterpretq_s16_u16(diff_hi_u16);
-
- sum_s16 = vaddq_s16(sum_s16, diff_lo_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_hi_s16);
-
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_lo_s16),
- vget_low_s16(diff_lo_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_high_s16(diff_lo_s16),
- vget_high_s16(diff_lo_s16));
-
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_low_s16(diff_hi_s16),
- vget_low_s16(diff_hi_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_hi_s16),
- vget_high_s16(diff_hi_s16));
- }
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128
+ assert(h <= 128);
+
+ do {
+ const uint8x8_t s = vld1_u8(src_ptr);
+ const uint8x8_t r = vld1_u8(ref_ptr);
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
src_ptr += src_stride;
ref_ptr += ref_stride;
- }
+ } while (--i != 0);
*sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
-// Process a block of width 8 two rows at a time.
-static void variance_neon_w8x2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride, int h,
- uint32_t *sse, int *sum) {
- int i = 0;
- int16x8_t sum_s16 = vdupq_n_s16(0);
- int32x4_t sse_lo_s32 = vdupq_n_s32(0);
- int32x4_t sse_hi_s32 = vdupq_n_s32(0);
+// Process a block of width 16 one row at a time.
+static INLINE void variance_16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int i = h;
- // Each column has it's own accumulator entry in sum_s16.
+ // Number of rows we can process before 'sum_s16' accumulators overflow:
+ // 32767 / 255 ~= 128, so 128 16-wide rows.
assert(h <= 128);
do {
- const uint8x8_t a_0_u8 = vld1_u8(src_ptr);
- const uint8x8_t a_1_u8 = vld1_u8(src_ptr + src_stride);
- const uint8x8_t b_0_u8 = vld1_u8(ref_ptr);
- const uint8x8_t b_1_u8 = vld1_u8(ref_ptr + ref_stride);
- const uint16x8_t diff_0_u16 = vsubl_u8(a_0_u8, b_0_u8);
- const uint16x8_t diff_1_u16 = vsubl_u8(a_1_u8, b_1_u8);
- const int16x8_t diff_0_s16 = vreinterpretq_s16_u16(diff_0_u16);
- const int16x8_t diff_1_s16 = vreinterpretq_s16_u16(diff_1_u16);
- sum_s16 = vaddq_s16(sum_s16, diff_0_s16);
- sum_s16 = vaddq_s16(sum_s16, diff_1_s16);
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_0_s16),
- vget_low_s16(diff_0_s16));
- sse_lo_s32 = vmlal_s16(sse_lo_s32, vget_low_s16(diff_1_s16),
- vget_low_s16(diff_1_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_0_s16),
- vget_high_s16(diff_0_s16));
- sse_hi_s32 = vmlal_s16(sse_hi_s32, vget_high_s16(diff_1_s16),
- vget_high_s16(diff_1_s16));
- src_ptr += src_stride + src_stride;
- ref_ptr += ref_stride + ref_stride;
- i += 2;
+ const uint8x16_t s = vld1q_u8(src_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_int16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+// Process a block of any size where the width is divisible by 16.
+static INLINE void variance_large_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int w, int h, int h_limit,
+ unsigned int *sse, int *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+ // accumulator overflows. After hitting this limit we accumulate into 32-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s = vld1q_u8(src_ptr + j);
+ const uint8x16_t r = vld1q_u8(ref_ptr + j);
+
+ const int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ const int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ j += 16;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+ h_tmp += h_limit;
} while (i < h);
- *sum = horizontal_add_int16x8(sum_s16);
- *sse = horizontal_add_uint32x4(
- vreinterpretq_u32_s32(vaddq_s32(sse_lo_s32, sse_hi_s32)));
+ *sum = horizontal_add_int32x4(sum_s32);
+ *sse = (uint32_t)horizontal_add_int32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
}
-#endif
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
+
+#endif // defined(__ARM_FEATURE_DOTPROD)
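
The h_limit values passed by variance_32xh_neon (64) and variance_64xh_neon
(32) follow from the accumulator capacity noted in the asserts above: an
int16_t lane can absorb at most 32767 / 255 ~= 128 absolute differences, and
each w-wide row adds w / 16 values to every lane of the paired 8-lane
accumulators. A hypothetical helper showing the arithmetic (not part of the
patch):

static int rows_before_s16_overflow(int w) {
  const int per_lane_capacity = 32767 / 255; /* == 128 */
  const int adds_per_row_per_lane = w / 16;
  return per_lane_capacity / adds_per_row_per_lane; /* w=32 -> 64, w=64 -> 32 */
}
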
void vpx_get8x8var_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
+ variance_8xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, sse, sum);
}
void vpx_get16x16var_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
unsigned int *sse, int *sum) {
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
+ variance_16xh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, sse, sum);
}
-#define VARIANCENXM(n, m, shift) \
- unsigned int vpx_variance##n##x##m##_neon( \
- const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
- int ref_stride, unsigned int *sse) { \
- int sum; \
- if (n == 4) \
- variance_neon_w4x4(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \
- &sum); \
- else if (n == 8) \
- variance_neon_w8x2(src_ptr, src_stride, ref_ptr, ref_stride, m, sse, \
- &sum); \
- else \
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, n, m, sse, \
- &sum); \
- if (n * m < 16 * 16) \
- return *sse - ((sum * sum) >> shift); \
- else \
- return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int vpx_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
}
-VARIANCENXM(4, 4, 4)
-VARIANCENXM(4, 8, 5)
-VARIANCENXM(8, 4, 5)
-VARIANCENXM(8, 8, 6)
-VARIANCENXM(8, 16, 7)
-VARIANCENXM(16, 8, 7)
-VARIANCENXM(16, 16, 8)
-VARIANCENXM(16, 32, 9)
-VARIANCENXM(32, 16, 9)
-VARIANCENXM(32, 32, 10)
-
-unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 32, 32, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (32 * src_stride), src_stride,
- ref_ptr + (32 * ref_stride), ref_stride, 32, 32, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
-unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (16 * src_stride), src_stride,
- ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 11);
-}
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
-unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_ptr, int ref_stride,
- unsigned int *sse) {
- int sum1, sum2;
- uint32_t sse1, sse2;
-
- variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 64, 16, &sse1,
- &sum1);
- variance_neon_w16(src_ptr + (16 * src_stride), src_stride,
- ref_ptr + (16 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w16(src_ptr + (16 * 2 * src_stride), src_stride,
- ref_ptr + (16 * 2 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- sse1 += sse2;
- sum1 += sum2;
-
- variance_neon_w16(src_ptr + (16 * 3 * src_stride), src_stride,
- ref_ptr + (16 * 3 * ref_stride), ref_stride, 64, 16, &sse2,
- &sum2);
- *sse = sse1 + sse2;
- sum1 += sum2;
- return *sse - (unsigned int)(((int64_t)sum1 * sum1) >> 12);
-}
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
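
The VARIANCE_WXH_NEON wrapper evaluates the usual single-pass identity
var = sse - sum^2 / N with N = w * h a power of two, so the division is the
shift parameter (log2(w * h): 4x4 -> 4, 16x16 -> 8, 64x64 -> 12) and the
widening to int64_t keeps sum * sum from overflowing for the large blocks.
Scalar restatement (illustrative only):

#include <stdint.h>

static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  /* shift == log2(w * h); sum can be as large as 64 * 64 * 255, so
   * square in 64 bits before shifting. */
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}
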
-#if defined(__ARM_FEATURE_DOTPROD) && (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__ARM_FEATURE_DOTPROD)
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
@@ -421,7 +433,7 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
return vget_lane_u32(sse, 0);
}
-#else
+#else // !defined(__ARM_FEATURE_DOTPROD)
unsigned int vpx_mse16x16_neon(const unsigned char *src_ptr, int src_stride,
const unsigned char *ref_ptr, int ref_stride,
@@ -518,4 +530,4 @@ unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int src_stride,
return horizontal_add_uint32x4(vreinterpretq_u32_s32(sse));
}
-#endif
+#endif // defined(__ARM_FEATURE_DOTPROD)
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
index 06b58c438..b4cdd58c7 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -31,8 +31,9 @@
// instructions. This optimization is much faster in the speed unit test, but slowed
// down the whole decoder by 5%.
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
+#if defined(__aarch64__) && \
+ (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
@@ -53,9 +54,176 @@ DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
};
-static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1,
- int8x8_t *a2, int8x8_t *a3,
- int8x16_t *b,
+#if defined(__ARM_FEATURE_MATMUL_INT8)
+
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+ t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+ t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+ t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+ d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+ d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+ d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
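
Each output pixel here is the standard eight-tap filter sum; vqrshrun_n_s16(..., 7) performs the round-to-nearest shift by seven bits with unsigned saturation. A scalar reference for a single sample, assuming only that the taps sum to 128 as usual for these kernels (helper name invented here):

/* Scalar reference for one output pixel; 'src' points at the first of
 * the eight input samples (the caller already applied src -= 3). */
static uint8_t convolve8_pixel_sketch(const uint8_t *src,
                                      const int16_t *filter) {
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += src[k] * filter[k];
  sum = (sum + 64) >> 7; /* Round to nearest, as vqrshrun_n_s16(x, 7). */
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum)); /* Saturate. */
}
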
+
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[x0_q4]));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(x_step_q4 == 16);
+
+ (void)x_step_q4;
+ (void)y0_q4;
+ (void)y_step_q4;
+
+ src -= 3;
+
+ if (w == 4) {
+ const uint8x16x2_t permute_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int32x4_t t0, t1, t2, t3;
+ int16x8_t t01, t23;
+ uint8x8_t d01, d23, dd01, dd23;
+ dd01 = vdup_n_u8(0);
+ dd23 = vdup_n_u8(0);
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filters, permute_tbl);
+ t1 = convolve8_4_usdot(s1, filters, permute_tbl);
+ t2 = convolve8_4_usdot(s2, filters, permute_tbl);
+ t3 = convolve8_4_usdot(s3, filters, permute_tbl);
+ t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
+ t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
+ d01 = vqrshrun_n_s16(t01, 7);
+ d23 = vqrshrun_n_s16(t23, 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t permute_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filters, permute_tbl);
+ d1 = convolve8_8_usdot(s1, filters, permute_tbl);
+ d2 = convolve8_8_usdot(s2, filters, permute_tbl);
+ d3 = convolve8_8_usdot(s3, filters, permute_tbl);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width > 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
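
The averaging variant differs from the plain kernel only in loading the existing destination pixels and combining them with vrhadd_u8, a rounding halving add. Per element this amounts to:

/* vrhadd_u8 per element: round-to-nearest average of new and old pixel. */
static uint8_t avg_pixel_sketch(uint8_t new_px, uint8_t old_px) {
  return (uint8_t)((new_px + old_px + 1) >> 1);
}
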
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b,
const uint8x16_t permute_tbl) {
/* Transpose 8-bit elements and concatenate result rows as follows:
* a0: 00, 01, 02, 03, XX, XX, XX, XX
@@ -70,13 +238,13 @@ static INLINE void transpose_concat_4x4(int8x8_t *a0, int8x8_t *a1,
* inline helper is called many times from the same parent function.
*/
- int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
- *b = vqtbl2q_s8(samples, permute_tbl);
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b = vqtbl2q_u8(samples, permute_tbl);
}
-static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1,
- int8x8_t *a2, int8x8_t *a3,
- int8x16_t *b0, int8x16_t *b1,
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0, uint8x16_t *b1,
const uint8x16x2_t permute_tbl) {
/* Transpose 8-bit elements and concatenate result rows as follows:
* a0: 00, 01, 02, 03, 04, 05, 06, 07
@@ -92,11 +260,364 @@ static INLINE void transpose_concat_8x4(int8x8_t *a0, int8x8_t *a1,
* inline helper is called many times from the same parent function.
*/
- int8x16x2_t samples = { { vcombine_s8(*a0, *a1), vcombine_s8(*a2, *a3) } };
- *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
- *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
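
A scalar model of transpose_concat_4x4, matching the layout documented in the comment above (the output holds the four rows column-major):

/* b[4 * col + row] = a[row][col] for a 4x4 block; the upper halves of
 * the input rows are don't-care lanes, as in the Neon version. */
static void transpose_concat_4x4_sketch(const uint8_t a[4][8], uint8_t b[16]) {
  for (int col = 0; col < 4; ++col) {
    for (int row = 0; row < 4; ++row) b[4 * col + row] = a[row][col];
  }
}
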
+
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filters);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filters);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filters);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
}
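
The merge_block_tbl lookups in the loop above avoid re-running the transpose for rows already held in registers: within each four-byte column group, the three newest rows are kept and one row from the freshly transposed block is appended. The val[0] row of the table is not visible in this hunk, so the indices below are inferred from the val[2] row shown earlier; a scalar model of building s4567 from s3456 and s78910:

/* Per column group: drop the oldest row and append the matching row
 * from the new block (indices inferred, see note above). */
static void merge_rows_sketch(const uint8_t s3456[16],
                              const uint8_t s78910[16], uint8_t s4567[16]) {
  for (int col = 0; col < 4; ++col) {
    s4567[4 * col + 0] = s3456[4 * col + 1];  /* row 4 */
    s4567[4 * col + 1] = s3456[4 * col + 2];  /* row 5 */
    s4567[4 * col + 2] = s3456[4 * col + 3];  /* row 6 */
    s4567[4 * col + 3] = s78910[4 * col + 0]; /* row 7 */
  }
}
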
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filters = vmovn_s16(vld1q_s16(filter[y0_q4]));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert(!((intptr_t)dst & 3));
+ assert(!(dst_stride & 3));
+ assert(y_step_q4 == 16);
+
+ (void)x0_q4;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= 3 * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int32x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23, dd01, dd23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filters);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filters);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filters);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filters);
+ d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
+ d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
+
+ dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
+ dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
+ d01 = vrhadd_u8(d01, dd01);
+ d23 = vrhadd_u8(d23, dd23);
+
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3, dd0, dd1, dd2, dd3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filters);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filters);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filters);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filters);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
+ d0 = vrhadd_u8(d0, dd0);
+ d1 = vrhadd_u8(d1, dd1);
+ d2 = vrhadd_u8(d2, dd2);
+ d3 = vrhadd_u8(d3, dd3);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+#else // !defined(__ARM_FEATURE_MATMUL_INT8)
+
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
@@ -125,33 +646,22 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
int16x8_t t01, t23;
uint8x8_t d01, d23;
- s0 = vld1q_u8(src);
- src += src_stride;
- s1 = vld1q_u8(src);
- src += src_stride;
- s2 = vld1q_u8(src);
- src += src_stride;
- s3 = vld1q_u8(src);
- src += src_stride;
-
- t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
- t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
- t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
- t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
d01 = vqrshrun_n_s16(t01, 7);
d23 = vqrshrun_n_s16(t23, 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -166,20 +676,18 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
do {
- s0 = vld1q_u8(s + 0 * src_stride);
- s1 = vld1q_u8(s + 1 * src_stride);
- s2 = vld1q_u8(s + 2 * src_stride);
- s3 = vld1q_u8(s + 3 * src_stride);
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
- d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
- d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
- d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
- d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
+ d0 =
+ convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+ d1 =
+ convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+ d2 =
+ convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+ d3 =
+ convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
@@ -222,20 +730,12 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
dd01 = vdup_n_u8(0);
dd23 = vdup_n_u8(0);
- s0 = vld1q_u8(src);
- src += src_stride;
- s1 = vld1q_u8(src);
- src += src_stride;
- s2 = vld1q_u8(src);
- src += src_stride;
- s3 = vld1q_u8(src);
- src += src_stride;
-
- t0 = convolve8_4_dot(s0, filters, correction, range_limit, permute_tbl);
- t1 = convolve8_4_dot(s1, filters, correction, range_limit, permute_tbl);
- t2 = convolve8_4_dot(s2, filters, correction, range_limit, permute_tbl);
- t3 = convolve8_4_dot(s3, filters, correction, range_limit, permute_tbl);
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+ t0 = convolve8_4_sdot(s0, filters, correction, range_limit, permute_tbl);
+ t1 = convolve8_4_sdot(s1, filters, correction, range_limit, permute_tbl);
+ t2 = convolve8_4_sdot(s2, filters, correction, range_limit, permute_tbl);
+ t3 = convolve8_4_sdot(s3, filters, correction, range_limit, permute_tbl);
t01 = vcombine_s16(vqmovn_s32(t0), vqmovn_s32(t1));
t23 = vcombine_s16(vqmovn_s32(t2), vqmovn_s32(t3));
d01 = vqrshrun_n_s16(t01, 7);
@@ -243,17 +743,15 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -268,29 +766,25 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
do {
- s0 = vld1q_u8(s + 0 * src_stride);
- s1 = vld1q_u8(s + 1 * src_stride);
- s2 = vld1q_u8(s + 2 * src_stride);
- s3 = vld1q_u8(s + 3 * src_stride);
-
- d0 = convolve8_8_dot(s0, filters, correction, range_limit, permute_tbl);
- d1 = convolve8_8_dot(s1, filters, correction, range_limit, permute_tbl);
- d2 = convolve8_8_dot(s2, filters, correction, range_limit, permute_tbl);
- d3 = convolve8_8_dot(s3, filters, correction, range_limit, permute_tbl);
-
- dd0 = vld1_u8(d + 0 * dst_stride);
- dd1 = vld1_u8(d + 1 * dst_stride);
- dd2 = vld1_u8(d + 2 * dst_stride);
- dd3 = vld1_u8(d + 3 * dst_stride);
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 =
+ convolve8_8_sdot(s0, filters, correction, range_limit, permute_tbl);
+ d1 =
+ convolve8_8_sdot(s1, filters, correction, range_limit, permute_tbl);
+ d2 =
+ convolve8_8_sdot(s2, filters, correction, range_limit, permute_tbl);
+ d3 =
+ convolve8_8_sdot(s3, filters, correction, range_limit, permute_tbl);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
d0 = vrhadd_u8(d0, dd0);
d1 = vrhadd_u8(d1, dd1);
d2 = vrhadd_u8(d2, dd2);
d3 = vrhadd_u8(d3, dd3);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
s += 8;
d += 8;
@@ -303,6 +797,49 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
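
Unlike the USDOT kernels above, these SDOT kernels cannot multiply unsigned samples directly: each sample is biased into signed range first, and the constant error is folded back in. The 'correction' argument is presumably 128 times the tap sum, precomputed by the caller; the clamp itself is visible in the vertical kernels below. The underlying identity, in scalar terms:

/* Identity used by the SDOT path:
 *   sum((s[k] - 128) * f[k]) + 128 * sum(f[k]) == sum(s[k] * f[k]). */
static int32_t sdot_corrected_sketch(const uint8_t *samples,
                                     const int8_t *taps) {
  int32_t correction = 0;
  int32_t sum;
  int k;
  for (k = 0; k < 8; ++k) correction += 128 * taps[k];
  sum = correction;
  for (k = 0; k < 8; ++k) sum += (samples[k] - 128) * taps[k];
  return sum;
}
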
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b0,
+ int8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
@@ -333,14 +870,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
int32x4_t d0, d1, d2, d3;
uint8x8_t d01, d23;
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- src += 4 * src_stride;
- t4 = vld1_u8(src);
- src += src_stride;
- t5 = vld1_u8(src);
- src += src_stride;
- t6 = vld1_u8(src);
- src += src_stride;
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -357,13 +888,13 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
do {
uint8x8_t t7, t8, t9, t10;
@@ -375,7 +906,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
/* Merge new data into block from previous iteration. */
samples_LUT.val[0] = s3456;
@@ -384,22 +915,15 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
-
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -409,6 +933,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s3456 = s78910;
src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -426,14 +951,8 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s += 4 * src_stride;
- t4 = vld1_u8(s);
- s += src_stride;
- t5 = vld1_u8(s);
- s += src_stride;
- t6 = vld1_u8(s);
- s += src_stride;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -450,19 +969,19 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
tran_concat_tbl);
do {
@@ -475,7 +994,7 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
tran_concat_tbl);
/* Merge new data into block from previous iteration. */
@@ -491,18 +1010,16 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filters);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filters);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filters);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filters);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -556,14 +1073,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
int32x4_t d0, d1, d2, d3;
uint8x8_t d01, d23, dd01, dd23;
- load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
- src += 4 * src_stride;
- t4 = vld1_u8(src);
- src += src_stride;
- t5 = vld1_u8(src);
- src += src_stride;
- t6 = vld1_u8(src);
- src += src_stride;
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -580,13 +1091,13 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_4x4(&s0, &s1, &s2, &s3, &s0123, tran_concat_tbl);
- transpose_concat_4x4(&s1, &s2, &s3, &s4, &s1234, tran_concat_tbl);
- transpose_concat_4x4(&s2, &s3, &s4, &s5, &s2345, tran_concat_tbl);
- transpose_concat_4x4(&s3, &s4, &s5, &s6, &s3456, tran_concat_tbl);
- transpose_concat_4x4(&s4, &s5, &s6, &s7, &s4567, tran_concat_tbl);
- transpose_concat_4x4(&s5, &s6, &s7, &s8, &s5678, tran_concat_tbl);
- transpose_concat_4x4(&s6, &s7, &s8, &s9, &s6789, tran_concat_tbl);
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
do {
uint8x8_t t7, t8, t9, t10;
@@ -598,7 +1109,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_4x4(&s7, &s8, &s9, &s10, &s78910, tran_concat_tbl);
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
/* Merge new data into block from previous iteration. */
samples_LUT.val[0] = s3456;
@@ -607,27 +1118,21 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_4_dot_partial(s0123, s4567, correction, filters);
- d1 = convolve8_4_dot_partial(s1234, s5678, correction, filters);
- d2 = convolve8_4_dot_partial(s2345, s6789, correction, filters);
- d3 = convolve8_4_dot_partial(s3456, s78910, correction, filters);
-
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filters);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filters);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filters);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filters);
d01 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d0), vqmovn_s32(d1)), 7);
d23 = vqrshrun_n_s16(vcombine_s16(vqmovn_s32(d2), vqmovn_s32(d3)), 7);
dd01 = load_u8(dst + 0 * dst_stride, dst_stride);
dd23 = load_u8(dst + 2 * dst_stride, dst_stride);
+
d01 = vrhadd_u8(d01, dd01);
d23 = vrhadd_u8(d23, dd23);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 1);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 0);
- dst += dst_stride;
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d23), 1);
- dst += dst_stride;
+ store_u8(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8(dst + 2 * dst_stride, dst_stride, d23);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -637,6 +1142,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s3456 = s78910;
src += 4 * src_stride;
+ dst += 4 * dst_stride;
h -= 4;
} while (h > 0);
} else {
@@ -654,14 +1160,8 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s = src;
d = dst;
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- s += 4 * src_stride;
- t4 = vld1_u8(s);
- s += src_stride;
- t5 = vld1_u8(s);
- s += src_stride;
- t6 = vld1_u8(s);
- s += src_stride;
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
/* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
@@ -678,19 +1178,19 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
/* This operation combines a conventional transpose and the sample permute
* (see horizontal case) required before computing the dot product.
*/
- transpose_concat_8x4(&s0, &s1, &s2, &s3, &s0123_lo, &s0123_hi,
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s1, &s2, &s3, &s4, &s1234_lo, &s1234_hi,
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s2, &s3, &s4, &s5, &s2345_lo, &s2345_hi,
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s3, &s4, &s5, &s6, &s3456_lo, &s3456_hi,
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s4, &s5, &s6, &s7, &s4567_lo, &s4567_hi,
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s5, &s6, &s7, &s8, &s5678_lo, &s5678_hi,
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
tran_concat_tbl);
- transpose_concat_8x4(&s6, &s7, &s8, &s9, &s6789_lo, &s6789_hi,
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
tran_concat_tbl);
do {
@@ -703,7 +1203,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
- transpose_concat_8x4(&s7, &s8, &s9, &s10, &s78910_lo, &s78910_hi,
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
tran_concat_tbl);
/* Merge new data into block from previous iteration. */
@@ -719,28 +1219,23 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
- d0 = convolve8_8_dot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
- correction, filters);
- d1 = convolve8_8_dot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
- correction, filters);
- d2 = convolve8_8_dot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
- correction, filters);
- d3 = convolve8_8_dot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
- correction, filters);
-
- dd0 = vld1_u8(d + 0 * dst_stride);
- dd1 = vld1_u8(d + 1 * dst_stride);
- dd2 = vld1_u8(d + 2 * dst_stride);
- dd3 = vld1_u8(d + 3 * dst_stride);
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filters);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filters);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filters);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filters);
+
+ load_u8_8x4(d, dst_stride, &dd0, &dd1, &dd2, &dd3);
+
d0 = vrhadd_u8(d0, dd0);
d1 = vrhadd_u8(d1, dd1);
d2 = vrhadd_u8(d2, dd2);
d3 = vrhadd_u8(d3, dd3);
- vst1_u8(d + 0 * dst_stride, d0);
- vst1_u8(d + 1 * dst_stride, d1);
- vst1_u8(d + 2 * dst_stride, d2);
- vst1_u8(d + 3 * dst_stride, d3);
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
/* Prepare block for next iteration - re-using as much as possible. */
/* Shuffle everything up four rows. */
@@ -764,29 +1259,11 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
-#else
-
-static INLINE void store_u8_8x8(uint8_t *s, const ptrdiff_t p,
- const uint8x8_t s0, const uint8x8_t s1,
- const uint8x8_t s2, const uint8x8_t s3,
- const uint8x8_t s4, const uint8x8_t s5,
- const uint8x8_t s6, const uint8x8_t s7) {
- vst1_u8(s, s0);
- s += p;
- vst1_u8(s, s1);
- s += p;
- vst1_u8(s, s2);
- s += p;
- vst1_u8(s, s3);
- s += p;
- vst1_u8(s, s4);
- s += p;
- vst1_u8(s, s5);
- s += p;
- vst1_u8(s, s6);
- s += p;
- vst1_u8(s, s7);
-}
+#endif // defined(__ARM_FEATURE_MATMUL_INT8)
+
+#else // !(defined(__aarch64__) &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8)))
void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
@@ -808,16 +1285,13 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
if (h == 4) {
uint8x8_t d01, d23;
- int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
- d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
int16x8_t tt0, tt1, tt2, tt3;
__builtin_prefetch(src + 0 * src_stride);
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -849,14 +1323,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vget_low_s16(tt2);
s10 = vget_low_s16(tt3);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -883,8 +1353,6 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int width;
const uint8_t *s;
uint8x8_t t4, t5, t6, t7;
@@ -927,14 +1395,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(t0), 0);
@@ -1002,22 +1466,14 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
- t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
- filter4);
- t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
- filter4);
- t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
- filter4);
- t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
- filter3, filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
store_u8_8x8(d, dst_stride, t0, t1, t2, t3, t4, t5, t6, t7);
@@ -1061,8 +1517,7 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
if (h == 4) {
uint8x8_t d01, d23;
- int16x4_t filter3, filter4, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0,
- d1, d2, d3;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
int16x8_t tt0, tt1, tt2, tt3;
uint32x4_t d0123 = vdupq_n_u32(0);
@@ -1070,8 +1525,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
@@ -1103,14 +1556,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s9 = vget_low_s16(tt2);
s10 = vget_low_s16(tt3);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1140,8 +1589,6 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
w -= 4;
} while (w != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int width;
const uint8_t *s;
uint8x8_t t4, t5, t6, t7;
@@ -1186,14 +1633,10 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 5 * src_stride);
__builtin_prefetch(src + 6 * src_stride);
__builtin_prefetch(src + 7 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
@@ -1276,22 +1719,14 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
- t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, filter3,
- filter4);
- t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, filter3,
- filter4);
- t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, filter3,
- filter4);
- t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters,
- filter3, filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
+ t4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters);
+ t5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters);
+ t6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters);
+ t7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters);
transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
@@ -1349,8 +1784,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
@@ -1387,14 +1820,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1417,8 +1846,6 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int height;
const uint8_t *s;
uint8_t *d;
@@ -1469,14 +1896,10 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
vst1_u8(d, t0);
d += dst_stride;
@@ -1521,8 +1944,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
src -= 3 * src_stride;
if (w == 4) {
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t d01, d23;
int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
uint32x4_t d0123 = vdupq_n_u32(0);
@@ -1560,14 +1981,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(src + 1 * src_stride);
__builtin_prefetch(src + 2 * src_stride);
__builtin_prefetch(src + 3 * src_stride);
- d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), 7);
d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), 7);
@@ -1598,8 +2015,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
h -= 4;
} while (h != 0);
} else {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int height;
const uint8_t *s;
uint8_t *d;
@@ -1651,14 +2066,10 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
__builtin_prefetch(s + 1 * src_stride);
__builtin_prefetch(s + 2 * src_stride);
__builtin_prefetch(s + 3 * src_stride);
- t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3,
- filter4);
- t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3,
- filter4);
- t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, filter3,
- filter4);
- t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, filter3,
- filter4);
+ t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters);
+ t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters);
+ t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters);
+ t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters);
d01 = vcombine_u8(t0, t1);
d23 = vcombine_u8(t2, t3);
@@ -1694,4 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
}
}
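
Throughout this fallback path, convolve8_4/convolve8_8 lose their pre-splatted filter3/filter4 arguments (previously lane 3 of the low half and lane 0 of the high half, per the deleted vdup_lane_s16 calls). The updated helper bodies are not part of this hunk; presumably they now reference those taps by lane directly. A hypothetical sketch of the simplified 4-lane helper:

/* Hypothetical body (assumption -- not shown in this patch): the two
 * centre taps are referenced by lane, with saturating adds for headroom. */
static int16x4_t convolve8_4_sketch(const int16x4_t s0, const int16x4_t s1,
                                    const int16x4_t s2, const int16x4_t s3,
                                    const int16x4_t s4, const int16x4_t s5,
                                    const int16x4_t s6, const int16x4_t s7,
                                    const int16x8_t filters) {
  const int16x4_t filters_lo = vget_low_s16(filters);
  const int16x4_t filters_hi = vget_high_s16(filters);
  int16x4_t sum;

  sum = vmul_lane_s16(s0, filters_lo, 0);
  sum = vmla_lane_s16(sum, s1, filters_lo, 1);
  sum = vmla_lane_s16(sum, s2, filters_lo, 2);
  sum = vmla_lane_s16(sum, s5, filters_hi, 1);
  sum = vmla_lane_s16(sum, s6, filters_hi, 2);
  sum = vmla_lane_s16(sum, s7, filters_hi, 3);
  sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3));
  sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0));
  return sum;
}
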
-#endif
+#endif // #if defined(__aarch64__) &&
+ // (defined(__ARM_FEATURE_DOTPROD) ||
+ // defined(__ARM_FEATURE_MATMUL_INT8))
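
The net effect in this file is a three-way compile-time dispatch. Schematically:

/* Kernel selection after this patch (schematic): */
#if defined(__aarch64__) && \
    (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
#if defined(__ARM_FEATURE_MATMUL_INT8)
/* USDOT kernels: unsigned-by-signed dot product, no range clamp. */
#else
/* SDOT kernels: signed dot product with range clamp and correction. */
#endif
#else
/* Generic Neon multiply-accumulate kernels. */
#endif
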
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
index 857b6d54e..ed7f18053 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -16,69 +16,12 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
-static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
- uint8x8_t *const s0, uint8x8_t *const s1,
- uint8x8_t *const s2, uint8x8_t *const s3) {
- *s0 = vld1_u8(s);
- s += p;
- *s1 = vld1_u8(s);
- s += p;
- *s2 = vld1_u8(s);
- s += p;
- *s3 = vld1_u8(s);
-}
-
-static INLINE void load_u8_8x8(const uint8_t *s, const ptrdiff_t p,
- uint8x8_t *const s0, uint8x8_t *const s1,
- uint8x8_t *const s2, uint8x8_t *const s3,
- uint8x8_t *const s4, uint8x8_t *const s5,
- uint8x8_t *const s6, uint8x8_t *const s7) {
- *s0 = vld1_u8(s);
- s += p;
- *s1 = vld1_u8(s);
- s += p;
- *s2 = vld1_u8(s);
- s += p;
- *s3 = vld1_u8(s);
- s += p;
- *s4 = vld1_u8(s);
- s += p;
- *s5 = vld1_u8(s);
- s += p;
- *s6 = vld1_u8(s);
- s += p;
- *s7 = vld1_u8(s);
-}
+#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
-static INLINE void load_u8_16x8(const uint8_t *s, const ptrdiff_t p,
- uint8x16_t *const s0, uint8x16_t *const s1,
- uint8x16_t *const s2, uint8x16_t *const s3,
- uint8x16_t *const s4, uint8x16_t *const s5,
- uint8x16_t *const s6, uint8x16_t *const s7) {
- *s0 = vld1q_u8(s);
- s += p;
- *s1 = vld1q_u8(s);
- s += p;
- *s2 = vld1q_u8(s);
- s += p;
- *s3 = vld1q_u8(s);
- s += p;
- *s4 = vld1q_u8(s);
- s += p;
- *s5 = vld1q_u8(s);
- s += p;
- *s6 = vld1q_u8(s);
- s += p;
- *s7 = vld1q_u8(s);
-}
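
These open-coded load helpers are dropped from the header; the kernels above now rely on shared equivalents (load_u8_16x4, load_u8_8x7, store_u8, store_u8_8x4), presumably hoisted into a common memory-helpers header elsewhere in the tree. The 4-wide store pattern, for example, mirrors the lane stores deleted earlier in this patch:

/* Sketch of the 4-wide store: one uint8x8_t holds two output rows,
 * written as 32-bit lanes 'stride' bytes apart (the kernels assert
 * 4-byte alignment of dst). */
static void store_u8_sketch(uint8_t *dst, ptrdiff_t stride, uint8x8_t d01) {
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d01), 0);
  vst1_lane_u32((uint32_t *)(dst + stride), vreinterpret_u32_u8(d01), 1);
}
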
-
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD) && \
- (__ARM_FEATURE_DOTPROD == 1)
-
-static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
- const int8x16_t samples_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
+static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int32x4_t correction,
+ const int8x8_t filters) {
/* Sample range-clamping and permutation are performed by the caller. */
int32x4_t sum;
@@ -90,11 +33,11 @@ static INLINE int32x4_t convolve8_4_dot_partial(const int8x16_t samples_lo,
return sum;
}
-static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x2_t permute_tbl) {
+static INLINE int32x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
int8x16_t clamped_samples, permuted_samples[2];
int32x4_t sum;
@@ -115,12 +58,12 @@ static INLINE int32x4_t convolve8_4_dot(uint8x16_t samples,
return sum;
}
-static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo,
- const int8x16_t samples0_hi,
- const int8x16_t samples1_lo,
- const int8x16_t samples1_hi,
- const int32x4_t correction,
- const int8x8_t filters) {
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int32x4_t correction,
+ const int8x8_t filters) {
/* Sample range-clamping and permutation are performed by the caller. */
int32x4_t sum0, sum1;
int16x8_t sum;
@@ -138,11 +81,11 @@ static INLINE uint8x8_t convolve8_8_dot_partial(const int8x16_t samples0_lo,
return vqrshrun_n_s16(sum, 7);
}
-static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
- const int8x8_t filters,
- const int32x4_t correction,
- const uint8x16_t range_limit,
- const uint8x16x3_t permute_tbl) {
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
int8x16_t clamped_samples, permuted_samples[3];
int32x4_t sum0, sum1;
int16x8_t sum;
@@ -171,15 +114,98 @@ static INLINE uint8x8_t convolve8_8_dot(uint8x16_t samples,
return vqrshrun_n_s16(sum, 7);
}
-#endif
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+
+#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+
+static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filters) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum;
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filters, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE int32x4_t convolve8_4_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x2_t permute_tbl) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+ /* Accumulate dot products from a zero accumulator; the unsigned USDOT path needs no range-clamp correction. */
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filters, 1);
+
+ /* Narrowing and packing is performed by the caller. */
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filters) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
+ const int8x8_t filters,
+ const uint8x16x3_t permute_tbl) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filters, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filters, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filters, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filters, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, 7);
+}
+
+#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
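The USDOT helpers above are the point of this block: vusdotq_lane_s32 multiplies unsigned samples by signed filter taps directly, so the range clamp and the precomputed correction constant that the SDOT path needs both disappear. For orientation, a minimal scalar model of what convolve8_4_usdot computes; the names here are illustrative, not part of libvpx, and the saturating narrow-and-pack happens in the callers:

    #include <stdint.h>

    /* Scalar model of convolve8_4_usdot: four outputs of an 8-tap FIR.
     * The Neon code forms the same sums with two vusdotq_lane_s32 dot
     * products over the permuted 4-byte sample groups. */
    static void convolve8_4_usdot_model(const uint8_t *samples,
                                        const int8_t *filters,
                                        int32_t sum[4]) {
      int i, k;
      for (i = 0; i < 4; ++i) {
        int32_t acc = 0;
        for (k = 0; k < 8; ++k) {
          acc += (int32_t)samples[i + k] * filters[k]; /* unsigned * signed */
        }
        sum[i] = acc; /* narrowing and packing done by the caller */
      }
    }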
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,
const int16x4_t s4, const int16x4_t s5,
const int16x4_t s6, const int16x4_t s7,
- const int16x8_t filters,
- const int16x4_t filter3,
- const int16x4_t filter4) {
+ const int16x8_t filters) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
int16x4_t sum;
@@ -190,8 +216,8 @@ static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
sum = vmla_lane_s16(sum, s5, filters_hi, 1);
sum = vmla_lane_s16(sum, s6, filters_hi, 2);
sum = vmla_lane_s16(sum, s7, filters_hi, 3);
- sum = vqadd_s16(sum, vmul_s16(s3, filter3));
- sum = vqadd_s16(sum, vmul_s16(s4, filter4));
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filters_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filters_hi, 0));
return sum;
}
@@ -199,9 +225,7 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3,
const int16x8_t s4, const int16x8_t s5,
const int16x8_t s6, const int16x8_t s7,
- const int16x8_t filters,
- const int16x8_t filter3,
- const int16x8_t filter4) {
+ const int16x8_t filters) {
const int16x4_t filters_lo = vget_low_s16(filters);
const int16x4_t filters_hi = vget_high_s16(filters);
int16x8_t sum;
@@ -212,15 +236,13 @@ static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
sum = vmlaq_lane_s16(sum, s5, filters_hi, 1);
sum = vmlaq_lane_s16(sum, s6, filters_hi, 2);
sum = vmlaq_lane_s16(sum, s7, filters_hi, 3);
- sum = vqaddq_s16(sum, vmulq_s16(s3, filter3));
- sum = vqaddq_s16(sum, vmulq_s16(s4, filter4));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filters_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filters_hi, 0));
return vqrshrun_n_s16(sum, 7);
}
static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
const int16x8_t filters) {
- const int16x8_t filter3 = vdupq_lane_s16(vget_low_s16(filters), 3);
- const int16x8_t filter4 = vdupq_lane_s16(vget_high_s16(filters), 0);
int16x8_t ss[8];
ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
@@ -233,7 +255,7 @@ static INLINE uint8x8_t scale_filter_8(const uint8x8_t *const s,
ss[7] = vreinterpretq_s16_u16(vmovl_u8(s[7]));
return convolve8_8(ss[0], ss[1], ss[2], ss[3], ss[4], ss[5], ss[6], ss[7],
- filters, filter3, filter4);
+ filters);
}
#endif // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_H_
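A note on the convolve8_4/convolve8_8 change above: the removed filter3/filter4 parameters were pre-broadcast copies of filter taps 3 and 4, so reading the same lanes with vmul_lane_s16/vmulq_lane_s16 leaves the arithmetic untouched while trimming every call site. Taps 3 and 4 are the largest-magnitude taps of the sub-pel filters, which is why only their products are folded in with saturating adds (vqadd_s16). A minimal one-lane sketch of that accumulation pattern, with illustrative names:

    #include <stdint.h>

    /* Saturating 16-bit add, mirroring vqadd_s16 on a single lane. */
    static int16_t sat_add_s16(int32_t a, int32_t b) {
      const int32_t s = a + b;
      if (s > INT16_MAX) return INT16_MAX;
      if (s < INT16_MIN) return INT16_MIN;
      return (int16_t)s;
    }

    /* One lane of convolve8_8(): the six outer taps accumulate with
     * wrapping adds, the two centre taps with saturating adds. The
     * vector code wraps the 16-bit multiplies the same way. */
    static int16_t convolve8_lane_model(const int16_t s[8],
                                        const int16_t f[8]) {
      int16_t sum = (int16_t)(s[0] * f[0] + s[1] * f[1] + s[2] * f[2] +
                              s[5] * f[5] + s[6] * f[6] + s[7] * f[7]);
      sum = sat_add_s16(sum, (int16_t)(s[3] * f[3]));
      return sat_add_s16(sum, (int16_t)(s[4] * f[4]));
    }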
diff --git a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
index 8edf8a66e..b8e3c5e54 100644
--- a/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
+++ b/libvpx/vpx_dsp/arm/vpx_scaled_convolve8_neon.c
@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/arm/vpx_convolve8_neon.h"
#include "vpx_ports/mem.h"
@@ -38,8 +39,6 @@ static INLINE void scaledconvolve_horiz_w4(
const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
if (x_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x8_t ss[4];
int16x4_t t[8], tt;
@@ -61,7 +60,7 @@ static INLINE void scaledconvolve_horiz_w4(
t[7] = vget_high_s16(ss[3]);
tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
- filters, filter3, filter4);
+ filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
} else {
@@ -167,8 +166,6 @@ static INLINE void scaledconvolve_vert_w4(
if (y_q4 & SUBPEL_MASK) {
const int16x8_t filters = vld1q_s16(y_filters[y_q4 & SUBPEL_MASK]);
- const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
- const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
uint8x8_t s[8], d;
int16x4_t t[8], tt;
@@ -183,8 +180,7 @@ static INLINE void scaledconvolve_vert_w4(
t[6] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[6])));
t[7] = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(s[7])));
- tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters,
- filter3, filter4);
+ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters);
d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0);
} else {
diff --git a/libvpx/vpx_dsp/avg.c b/libvpx/vpx_dsp/avg.c
index 1c45e8a73..954015407 100644
--- a/libvpx/vpx_dsp/avg.c
+++ b/libvpx/vpx_dsp/avg.c
@@ -7,6 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+
+#include <assert.h>
#include <stdlib.h>
#include "./vpx_dsp_rtcd.h"
@@ -344,6 +346,7 @@ void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref,
const int ref_stride, const int height) {
int idx;
const int norm_factor = height >> 1;
+ assert(height >= 2);
for (idx = 0; idx < 16; ++idx) {
int i;
hbuf[idx] = 0;
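The assert added above documents a precondition rather than changing behaviour: further down in vpx_int_pro_row_c(), outside this hunk, each accumulated column sum is divided by norm_factor, and height >> 1 is zero for any height below 2:

    const int norm_factor = height >> 1; /* 1 -> 0, 2 -> 1, 16 -> 8 */
    assert(height >= 2);                 /* keeps the later division well-defined */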
diff --git a/libvpx/vpx_dsp/bitwriter.h b/libvpx/vpx_dsp/bitwriter.h
index 04084af8f..5f1ee69ec 100644
--- a/libvpx/vpx_dsp/bitwriter.h
+++ b/libvpx/vpx_dsp/bitwriter.h
@@ -13,6 +13,7 @@
#include <stdio.h>
+#include "vpx_ports/compiler_attributes.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/prob.h"
@@ -35,7 +36,9 @@ typedef struct vpx_writer {
void vpx_start_encode(vpx_writer *br, uint8_t *source);
void vpx_stop_encode(vpx_writer *br);
-static INLINE void vpx_write(vpx_writer *br, int bit, int probability) {
+static INLINE VPX_NO_UNSIGNED_SHIFT_CHECK void vpx_write(vpx_writer *br,
+ int bit,
+ int probability) {
unsigned int split;
int count = br->count;
unsigned int range = br->range;
diff --git a/libvpx/vpx_dsp/loongarch/quantize_lsx.c b/libvpx/vpx_dsp/loongarch/quantize_lsx.c
index 2fc33b06b..77be0bb4f 100644
--- a/libvpx/vpx_dsp/loongarch/quantize_lsx.c
+++ b/libvpx/vpx_dsp/loongarch/quantize_lsx.c
@@ -59,7 +59,6 @@ static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
}
static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
- __m128i zbin_mask0, __m128i zbin_mask1,
const int16_t *scan, int index,
__m128i zero) {
const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
@@ -68,8 +67,6 @@ static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
__m128i scan1 = __lsx_vld(scan + index + 8, 0);
__m128i eob0, eob1;
- scan0 = __lsx_vsub_h(scan0, zbin_mask0);
- scan1 = __lsx_vsub_h(scan1, zbin_mask1);
eob0 = __lsx_vandn_v(zero_coeff0, scan0);
eob1 = __lsx_vandn_v(zero_coeff1, scan1);
return __lsx_vmax_h(eob0, eob1);
@@ -138,7 +135,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
dequant = __lsx_vilvh_d(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
coeff0 = __lsx_vld(coeff_ptr + index, 0);
@@ -161,8 +158,7 @@ void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
eob = __lsx_vmax_h(eob, eob0);
index += 16;
@@ -221,7 +217,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
dequant = __lsx_vilvh_d(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(qcoeff0, qcoeff1, iscan, 0, zero);
// AC only loop.
for (index = 16; index < 32 * 32; index += 16) {
coeff0 = __lsx_vld(coeff_ptr + index, 0);
@@ -243,8 +239,7 @@ void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
dqcoeff_ptr + 8 + index);
- eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(qcoeff0, qcoeff1, iscan, index, zero);
eob = __lsx_vmax_h(eob, eob0);
}
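Dropping the zbin-mask arguments works because qcoeff is already zeroed for coefficients that fail the zero-bin test, so scan_for_eob() only needs the largest iscan position holding a nonzero quantized coefficient; the off-by-one bookkeeping that the subtraction provided moves out of this helper (the callers that materialize the final EOB are not shown in these hunks). A scalar model of the surviving max-reduction, with an illustrative name:

    #include <stdint.h>

    /* Running max of iscan over nonzero quantized coefficients,
     * matching the __lsx_vandn_v/__lsx_vmax_h reduction above. */
    static int16_t scan_for_eob_model(const int16_t *qcoeff,
                                      const int16_t *iscan, int n) {
      int16_t eob = 0;
      int i;
      for (i = 0; i < n; ++i) {
        if (qcoeff[i] != 0 && iscan[i] > eob) eob = iscan[i];
      }
      return eob;
    }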
diff --git a/libvpx/vpx_dsp/loopfilter.c b/libvpx/vpx_dsp/loopfilter.c
index 995602831..d6504aab1 100644
--- a/libvpx/vpx_dsp/loopfilter.c
+++ b/libvpx/vpx_dsp/loopfilter.c
@@ -159,7 +159,7 @@ void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
vpx_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
-static INLINE void filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
uint8_t *op3, uint8_t *op2, uint8_t *op1,
uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
uint8_t *oq2, uint8_t *oq3) {
@@ -232,8 +232,8 @@ void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
vpx_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit1, limit1, thresh1);
}
-static INLINE void filter16(int8_t mask, uint8_t thresh, uint8_t flat,
- uint8_t flat2, uint8_t *op7, uint8_t *op6,
+static INLINE void filter16(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint8_t *op7, uint8_t *op6,
uint8_t *op5, uint8_t *op4, uint8_t *op3,
uint8_t *op2, uint8_t *op1, uint8_t *op0,
uint8_t *oq0, uint8_t *oq1, uint8_t *oq2,
@@ -505,7 +505,7 @@ void vpx_highbd_lpf_vertical_4_dual_c(
bd);
}
-static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, uint8_t flat,
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
uint16_t *op3, uint16_t *op2, uint16_t *op1,
uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
uint16_t *oq2, uint16_t *oq3, int bd) {
@@ -584,8 +584,8 @@ void vpx_highbd_lpf_vertical_8_dual_c(
bd);
}
-static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, uint8_t flat,
- uint8_t flat2, uint16_t *op7, uint16_t *op6,
+static INLINE void highbd_filter16(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint16_t *op7, uint16_t *op6,
uint16_t *op5, uint16_t *op4, uint16_t *op3,
uint16_t *op2, uint16_t *op1, uint16_t *op0,
uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
diff --git a/libvpx/vpx_dsp/mips/macros_msa.h b/libvpx/vpx_dsp/mips/macros_msa.h
index 3c2f50c79..d54ce5368 100644
--- a/libvpx/vpx_dsp/mips/macros_msa.h
+++ b/libvpx/vpx_dsp/mips/macros_msa.h
@@ -83,31 +83,33 @@
val_lh_m; \
})
-#define LW(psrc) \
- ({ \
- const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
- uint32_t val_lw_m; \
- \
- __asm__ __volatile__("lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
- "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
- : [val_lw_m] "=&r"(val_lw_m) \
- : [psrc_lw_m] "r"(psrc_lw_m)); \
- \
- val_lw_m; \
+#define LW(psrc) \
+ ({ \
+ const uint8_t *psrc_lw_m = (const uint8_t *)(psrc); \
+ uint32_t val_lw_m; \
+ \
+ __asm__ __volatile__( \
+ "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
+ "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
+ : [val_lw_m] "=&r"(val_lw_m) \
+ : [psrc_lw_m] "r"(psrc_lw_m)); \
+ \
+ val_lw_m; \
})
#if (__mips == 64)
-#define LD(psrc) \
- ({ \
- const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
- uint64_t val_ld_m = 0; \
- \
- __asm__ __volatile__("ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
- "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
- : [val_ld_m] "=&r"(val_ld_m) \
- : [psrc_ld_m] "r"(psrc_ld_m)); \
- \
- val_ld_m; \
+#define LD(psrc) \
+ ({ \
+ const uint8_t *psrc_ld_m = (const uint8_t *)(psrc); \
+ uint64_t val_ld_m = 0; \
+ \
+ __asm__ __volatile__( \
+ "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
+ "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
+ : [val_ld_m] "=&r"(val_ld_m) \
+ : [psrc_ld_m] "r"(psrc_ld_m)); \
+ \
+ val_ld_m; \
})
#else // !(__mips == 64)
#define LD(psrc) \
diff --git a/libvpx/vpx_dsp/ppc/quantize_vsx.c b/libvpx/vpx_dsp/ppc/quantize_vsx.c
index 7cdcbeb40..ab71f6e23 100644
--- a/libvpx/vpx_dsp/ppc/quantize_vsx.c
+++ b/libvpx/vpx_dsp/ppc/quantize_vsx.c
@@ -78,11 +78,10 @@ static INLINE int16x8_t dequantize_coeff_32(int16x8_t qcoeff,
return (int16x8_t)vec_perm(dqcoeffe, dqcoeffo, vec_perm_odd_even_pack);
}
-static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff,
const int16_t *iscan_ptr, int index) {
int16x8_t scan = vec_vsx_ld(index, iscan_ptr);
bool16x8_t zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
- scan = vec_sub(scan, mask);
return vec_andc(scan, zero_coeff);
}
@@ -139,8 +138,8 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dqcoeff1 = vec_mladd(qcoeff1, dequant, vec_zeros_s16);
vec_vsx_st(dqcoeff1, 16, dqcoeff_ptr);
- eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
- nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+ eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, iscan_ptr, 16));
if (n_coeffs > 16) {
int index = 16;
@@ -177,10 +176,9 @@ void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vec_vsx_st(dqcoeff1, off1, dqcoeff_ptr);
vec_vsx_st(dqcoeff2, off2, dqcoeff_ptr);
- eob =
- vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
- eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
- nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+ eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, iscan_ptr, off2));
eob = vec_max(eob, eob2);
index += 24;
@@ -252,8 +250,8 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = vec_splat(dequant, 1); // remove DC from dequant
vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), 16, dqcoeff_ptr);
- eob = vec_max(nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, 0),
- nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, 16));
+ eob = vec_max(nonzero_scanindex(qcoeff0, iscan_ptr, 0),
+ nonzero_scanindex(qcoeff1, iscan_ptr, 16));
do {
int16x8_t coeff2, coeff2_abs, qcoeff2, eob2;
@@ -286,9 +284,9 @@ void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
vec_vsx_st(dequantize_coeff_32(qcoeff1, dequant), off1, dqcoeff_ptr);
vec_vsx_st(dequantize_coeff_32(qcoeff2, dequant), off2, dqcoeff_ptr);
- eob = vec_max(eob, nonzero_scanindex(qcoeff0, zero_mask0, iscan_ptr, off0));
- eob2 = vec_max(nonzero_scanindex(qcoeff1, zero_mask1, iscan_ptr, off1),
- nonzero_scanindex(qcoeff2, zero_mask2, iscan_ptr, off2));
+ eob = vec_max(eob, nonzero_scanindex(qcoeff0, iscan_ptr, off0));
+ eob2 = vec_max(nonzero_scanindex(qcoeff1, iscan_ptr, off1),
+ nonzero_scanindex(qcoeff2, iscan_ptr, off2));
eob = vec_max(eob, eob2);
// 24 int16_t is 48 bytes
diff --git a/libvpx/vpx_dsp/psnr.c b/libvpx/vpx_dsp/psnr.c
index 48bac0450..f0d4e927a 100644
--- a/libvpx/vpx_dsp/psnr.c
+++ b/libvpx/vpx_dsp/psnr.c
@@ -26,57 +26,44 @@ double vpx_sse_to_psnr(double samples, double peak, double sse) {
/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
* and highbd_8_variance(). It should not.
*/
-static void encoder_variance(const uint8_t *a, int a_stride, const uint8_t *b,
- int b_stride, int w, int h, unsigned int *sse,
- int *sum) {
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
int i, j;
-
- *sum = 0;
- *sse = 0;
+ int64_t sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
+ sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
+
+ return sse;
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, uint64_t *sse, int64_t *sum) {
+static int64_t encoder_highbd_8_sse(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h) {
int i, j;
+ int64_t sse = 0;
uint16_t *a = CONVERT_TO_SHORTPTR(a8);
uint16_t *b = CONVERT_TO_SHORTPTR(b8);
- *sum = 0;
- *sse = 0;
for (i = 0; i < h; i++) {
for (j = 0; j < w; j++) {
const int diff = a[j] - b[j];
- *sum += diff;
- *sse += diff * diff;
+ sse += diff * diff;
}
a += a_stride;
b += b_stride;
}
-}
-static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
- const uint8_t *b8, int b_stride, int w,
- int h, unsigned int *sse, int *sum) {
- uint64_t sse_long = 0;
- int64_t sum_long = 0;
- encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long,
- &sum_long);
- *sse = (unsigned int)sse_long;
- *sum = (int)sum_long;
+ return sse;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -85,26 +72,23 @@ static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
const int dw = width % 16;
const int dh = height % 16;
int64_t total_sse = 0;
- unsigned int sse = 0;
- int sum = 0;
int x, y;
if (dw > 0) {
- encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride, dw,
- height, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height);
}
if (dh > 0) {
- encoder_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride, width - dw, dh,
- &sse, &sum);
- total_sse += sse;
+ total_sse +=
+ encoder_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
+ unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
@@ -146,22 +130,19 @@ static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
int x, y;
const int dw = width % 16;
const int dh = height % 16;
- unsigned int sse = 0;
- int sum = 0;
if (dw > 0) {
- encoder_highbd_8_variance(&a[width - dw], a_stride, &b[width - dw],
- b_stride, dw, height, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_highbd_8_sse(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height);
}
if (dh > 0) {
- encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
- &b[(height - dh) * b_stride], b_stride,
- width - dw, dh, &sse, &sum);
- total_sse += sse;
+ total_sse += encoder_highbd_8_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh);
}
for (y = 0; y < height / 16; ++y) {
const uint8_t *pa = a;
const uint8_t *pb = b;
+ unsigned int sse;
for (x = 0; x < width / 16; ++x) {
vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
total_sse += sse;
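These psnr.c changes pair with the "Fix unsigned integer overflow in sse computation" entry in the log above: encoder_variance() accumulated diff * diff into an unsigned int, which can wrap on large strips, and its sum output was computed only to be discarded. The new int64_t accumulator has headroom for any practical block. A back-of-the-envelope bound, illustrative only:

    #include <stdint.h>

    /* Worst-case SSE of a w x h block at bit depth bd. At 8 bits one
     * squared diff is at most 255^2 = 65025, so ~66k pixels can exceed
     * 2^32 -- e.g. a 64 x 1080 edge strip; at 12 bits a single squared
     * diff is already ~16.8M. */
    static int64_t worst_case_sse(int w, int h, int bd) {
      const int64_t d = (1 << bd) - 1; /* max |a - b| */
      return (int64_t)w * h * d * d;
    }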
diff --git a/libvpx/vpx_dsp/variance.c b/libvpx/vpx_dsp/variance.c
index 30b55dcb4..ce1e8382b 100644
--- a/libvpx/vpx_dsp/variance.c
+++ b/libvpx/vpx_dsp/variance.c
@@ -549,9 +549,9 @@ HIGHBD_MSE(16, 8)
HIGHBD_MSE(8, 16)
HIGHBD_MSE(8, 8)
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint16_t *pred,
- int width, int height, const uint16_t *ref,
- int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
int i, j;
for (i = 0; i < height; ++i) {
for (j = 0; j < width; ++j) {
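The _c suffix added to vpx_highbd_comp_avg_pred follows the standard RTCD convention: once a function gains SIMD versions (see the new "specialize qw/vpx_highbd_comp_avg_pred neon sse2/" line in vpx_dsp_rtcd_defs.pl below), the plain name becomes a dispatch pointer and the portable implementation is reachable only as the _c fallback.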
diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk
index 13999af04..1fd9495cf 100644
--- a/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/libvpx/vpx_dsp/vpx_dsp.mk
@@ -226,19 +226,19 @@ DSP_SRCS-$(HAVE_SSE2) += x86/fwd_dct32x32_impl_sse2.h
ifeq ($(VPX_ARCH_X86_64),yes)
DSP_SRCS-$(HAVE_SSSE3) += x86/fwd_txfm_ssse3_x86_64.asm
endif
-DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/fwd_dct32x32_impl_avx2.h
-DSP_SRCS-$(HAVE_NEON) += arm/fdct_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct4x4_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/fdct8x8_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct16x16_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct32x32_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/fdct_partial_neon.c
-DSP_SRCS-$(HAVE_NEON) += arm/fwd_txfm_neon.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.h
DSP_SRCS-$(HAVE_MSA) += mips/fwd_txfm_msa.c
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.h
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_txfm_lsx.c
ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_AVX2) += x86/fwd_txfm_avx2.c
DSP_SRCS-$(HAVE_MSA) += mips/fwd_dct32x32_msa.c
DSP_SRCS-$(HAVE_LSX) += loongarch/fwd_dct32x32_lsx.c
endif # !CONFIG_VP9_HIGHBITDEPTH
@@ -326,11 +326,14 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.h
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.h
DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c
+DSP_SRCS-$(HAVE_AVX2) += x86/quantize_avx2.c
DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c
DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c
DSP_SRCS-$(HAVE_LSX) += loongarch/quantize_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_quantize_intrin_avx2.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_quantize_neon.c
endif
# avg
@@ -374,6 +377,7 @@ DSP_SRCS-$(HAVE_MMI) += mips/subtract_mmi.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad4d_avx2.c
DSP_SRCS-$(HAVE_AVX2) += x86/sad_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/subtract_avx2.c
DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
DSP_SRCS-$(HAVE_SSE2) += x86/sad4d_sse2.asm
@@ -388,6 +392,9 @@ DSP_SRCS-$(HAVE_LSX) += loongarch/subtract_lsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_sad_neon.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad4d_avx2.c
+DSP_SRCS-$(HAVE_AVX2) += x86/highbd_sad_avx2.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS
@@ -425,6 +432,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_variance_impl_sse2.asm
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_variance_neon.c
endif # CONFIG_VP9_HIGHBITDEPTH
endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index d3c668f9a..8725821b6 100644
--- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -527,6 +527,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4_1 sse2 neon/;
+ specialize qw/vpx_highbd_fdct4x4_1 neon/;
+ $vpx_highbd_fdct4x4_1_neon=vpx_fdct4x4_1_neon;
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct8x8 neon sse2/;
@@ -550,27 +552,29 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_fdct32x32_1 sse2 neon/;
add_proto qw/void vpx_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct4x4 sse2/;
+ specialize qw/vpx_highbd_fdct4x4 sse2 neon/;
add_proto qw/void vpx_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct8x8 sse2/;
+ specialize qw/vpx_highbd_fdct8x8 sse2 neon/;
add_proto qw/void vpx_highbd_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_highbd_fdct8x8_1 neon/;
$vpx_highbd_fdct8x8_1_neon=vpx_fdct8x8_1_neon;
add_proto qw/void vpx_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct16x16 sse2/;
+ specialize qw/vpx_highbd_fdct16x16 sse2 neon/;
add_proto qw/void vpx_highbd_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct16x16_1 neon/;
add_proto qw/void vpx_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct32x32 sse2/;
+ specialize qw/vpx_highbd_fdct32x32 sse2 neon/;
add_proto qw/void vpx_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_highbd_fdct32x32_rd sse2/;
+ specialize qw/vpx_highbd_fdct32x32_rd sse2 neon/;
add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/vpx_highbd_fdct32x32_1 neon/;
} else {
add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
@@ -711,17 +715,17 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/;
+ specialize qw/vpx_quantize_b neon sse2 ssse3 avx avx2 vsx lsx/;
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/;
+ specialize qw/vpx_quantize_b_32x32 neon ssse3 avx avx2 vsx lsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_highbd_quantize_b sse2/;
+ specialize qw/vpx_highbd_quantize_b neon sse2 avx2/;
add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
+ specialize qw/vpx_highbd_quantize_b_32x32 neon sse2 avx2/;
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER
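For readers outside the build system: each specialize line here drives libvpx's run-time CPU dispatch (RTCD). The generator turns it into per-ISA prototypes plus a function pointer that setup_rtcd_internal() retargets at init; the pregenerated results for this port are the config/*/vpx_dsp_rtcd.h files touched at the top of this change. A simplified sketch of the generated shape for the NEON case, with the long quantizer signature shortened to void and the HAS_NEON bit value illustrative:

    /* Rough shape of what `specialize qw/vpx_quantize_b neon .../`
     * generates in vpx_dsp_rtcd.h. */
    #define RTCD_EXTERN extern
    #define HAS_NEON 0x04          /* bit value illustrative */
    extern int arm_cpu_caps(void); /* vpx_ports/arm.h */

    void vpx_quantize_b_c(void);
    void vpx_quantize_b_neon(void);
    RTCD_EXTERN void (*vpx_quantize_b)(void);

    static void setup_rtcd_internal(void) {
      const int flags = arm_cpu_caps();
      vpx_quantize_b = vpx_quantize_b_c; /* portable fallback */
      if (flags & HAS_NEON) vpx_quantize_b = vpx_quantize_b_neon;
    }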
@@ -730,7 +734,7 @@ if (vpx_config("CONFIG_ENCODERS") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 avx2 vsx lsx/;
#
# Single block SAD
@@ -795,7 +799,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
- specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+ specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
add_proto qw/void vpx_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
specialize qw/vpx_highbd_hadamard_8x8 avx2/;
@@ -819,7 +823,7 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_32x32 sse2 avx2/;
+ specialize qw/vpx_hadamard_32x32 sse2 avx2 neon/;
add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
specialize qw/vpx_satd avx2 sse2 neon msa/;
@@ -935,46 +939,49 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Block subtraction
#
add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src8_ptr, ptrdiff_t src_stride, const uint8_t *pred8_ptr, ptrdiff_t pred_stride, int bd";
+ specialize qw/vpx_highbd_subtract_block neon avx2/;
#
# Single block SAD
#
add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad64x64 sse2/;
+ specialize qw/vpx_highbd_sad64x64 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad64x32 sse2/;
+ specialize qw/vpx_highbd_sad64x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x64 sse2/;
+ specialize qw/vpx_highbd_sad32x64 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x32 sse2/;
+ specialize qw/vpx_highbd_sad32x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad32x16 sse2/;
+ specialize qw/vpx_highbd_sad32x16 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x32 sse2/;
+ specialize qw/vpx_highbd_sad16x32 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x16 sse2/;
+ specialize qw/vpx_highbd_sad16x16 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad16x8 sse2/;
+ specialize qw/vpx_highbd_sad16x8 sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x16 sse2/;
+ specialize qw/vpx_highbd_sad8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x8 sse2/;
+ specialize qw/vpx_highbd_sad8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
- specialize qw/vpx_highbd_sad8x4 sse2/;
+ specialize qw/vpx_highbd_sad8x4 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x8 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ specialize qw/vpx_highbd_sad4x4 neon/;
#
# Avg
@@ -988,83 +995,85 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max";
add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad64x64_avg sse2/;
+ specialize qw/vpx_highbd_sad64x64_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad64x32_avg sse2/;
+ specialize qw/vpx_highbd_sad64x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x64_avg sse2/;
+ specialize qw/vpx_highbd_sad32x64_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x32_avg sse2/;
+ specialize qw/vpx_highbd_sad32x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad32x16_avg sse2/;
+ specialize qw/vpx_highbd_sad32x16_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x32_avg sse2/;
+ specialize qw/vpx_highbd_sad16x32_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x16_avg sse2/;
+ specialize qw/vpx_highbd_sad16x16_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad16x8_avg sse2/;
+ specialize qw/vpx_highbd_sad16x8_avg sse2 neon avx2/;
add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x16_avg sse2/;
+ specialize qw/vpx_highbd_sad8x16_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x8_avg sse2/;
+ specialize qw/vpx_highbd_sad8x8_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
- specialize qw/vpx_highbd_sad8x4_avg sse2/;
+ specialize qw/vpx_highbd_sad8x4_avg sse2 neon/;
add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x8_avg neon/;
add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_sad4x4_avg neon/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad64x64x4d sse2/;
+ specialize qw/vpx_highbd_sad64x64x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad64x32x4d sse2/;
+ specialize qw/vpx_highbd_sad64x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x64x4d sse2/;
+ specialize qw/vpx_highbd_sad32x64x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x32x4d sse2/;
+ specialize qw/vpx_highbd_sad32x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad32x16x4d sse2/;
+ specialize qw/vpx_highbd_sad32x16x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x32x4d sse2/;
+ specialize qw/vpx_highbd_sad16x32x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x16x4d sse2/;
+ specialize qw/vpx_highbd_sad16x16x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad16x8x4d sse2/;
+ specialize qw/vpx_highbd_sad16x8x4d sse2 neon avx2/;
add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x16x4d sse2/;
+ specialize qw/vpx_highbd_sad8x16x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x8x4d sse2/;
+ specialize qw/vpx_highbd_sad8x8x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad8x4x4d sse2/;
+ specialize qw/vpx_highbd_sad8x4x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad4x8x4d sse2/;
+ specialize qw/vpx_highbd_sad4x8x4d sse2 neon/;
add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
- specialize qw/vpx_highbd_sad4x4x4d sse2/;
+ specialize qw/vpx_highbd_sad4x4x4d sse2 neon/;
#
# Structured Similarity (SSIM)
@@ -1232,369 +1241,397 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_variance4x4 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_variance4x4 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_variance64x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_variance64x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_variance32x64 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_variance32x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_variance32x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_variance16x32 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_variance16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_variance16x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_variance8x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_variance8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance8x4 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance4x8 neon/;
add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_variance4x4 neon/;
add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get16x16var sse2/;
+ specialize qw/vpx_highbd_8_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_8_get8x8var sse2/;
+ specialize qw/vpx_highbd_8_get8x8var sse2 neon/;
add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get16x16var sse2/;
+ specialize qw/vpx_highbd_10_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_10_get8x8var sse2/;
+ specialize qw/vpx_highbd_10_get8x8var sse2 neon/;
add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get16x16var sse2/;
+ specialize qw/vpx_highbd_12_get16x16var sse2 neon/;
add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vpx_highbd_12_get8x8var sse2/;
+ specialize qw/vpx_highbd_12_get8x8var sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse16x16 sse2/;
+ specialize qw/vpx_highbd_8_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_8_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_8_mse8x8 sse2/;
+ specialize qw/vpx_highbd_8_mse8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse16x16 sse2/;
+ specialize qw/vpx_highbd_10_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_10_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_10_mse8x8 sse2/;
+ specialize qw/vpx_highbd_10_mse8x8 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse16x16 sse2/;
+ specialize qw/vpx_highbd_12_mse16x16 sse2 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse16x8 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ specialize qw/vpx_highbd_12_mse8x16 neon/;
add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vpx_highbd_12_mse8x8 sse2/;
+ specialize qw/vpx_highbd_12_mse8x8 sse2 neon/;
add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint16_t *pred, int width, int height, const uint16_t *ref, int ref_stride";
+ specialize qw/vpx_highbd_comp_avg_pred neon sse2/;
#
# Subpixel Variance
#
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_12_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_10_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
- specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ specialize qw/vpx_highbd_8_sub_pixel_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_12_sub_pixel_avg_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_10_sub_pixel_avg_variance4x4 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
- specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2/;
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x8 neon/;
add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ specialize qw/vpx_highbd_8_sub_pixel_avg_variance4x4 neon/;
} # CONFIG_VP9_HIGHBITDEPTH
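
Context for the hunk above: rtcd.pl consumes each add_proto/specialize pair when the per-target headers under config/ are regenerated, so adding neon to a specialize line is what declares the _neon variant and routes callers to it on NEON targets (x86 configs instead dispatch through RTCD_EXTERN function pointers chosen at runtime). An illustrative sketch of the static arm-neon output for one of the functions above:

    unsigned int vpx_highbd_8_mse16x16_c(const uint8_t *src_ptr, int src_stride,
                                         const uint8_t *ref_ptr, int ref_stride,
                                         unsigned int *sse);
    unsigned int vpx_highbd_8_mse16x16_neon(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *ref_ptr,
                                            int ref_stride, unsigned int *sse);
    #define vpx_highbd_8_mse16x16 vpx_highbd_8_mse16x16_neon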
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
index 3f4f577a2..b2e01319d 100644
--- a/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/avg_intrin_avx2.c
@@ -104,7 +104,7 @@ void vpx_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
- src16[7] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
src32[0] = _mm256_cvtepi16_epi32(src16[0]);
src32[1] = _mm256_cvtepi16_epi32(src16[1]);
@@ -304,7 +304,7 @@ static void hadamard_8x8x2_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
- src[7] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
hadamard_col8x2_avx2(src, 0);
hadamard_col8x2_avx2(src, 1);
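
The two loads rewritten above (and the matching SSE2 hunk below) drop the pointer update from the final row: src_diff += src_stride both computes the address and writes the incremented pointer back, but after the last load nothing reads src_diff again, so the write-back is a dead store that static analyzers flag. An illustrative sketch of the pattern, with p, stride and load() standing in for the real names:

    r[0] = load(p);
    r[1] = load(p += stride); /* rows 1..6: the updated p feeds the next load */
    /* ... */
    r[7] = load(p + stride);  /* last row: use the address, skip the dead
                                 pointer update that nothing reads afterwards */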
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index 9da2f34c9..015c11a1f 100644
--- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -164,7 +164,7 @@ unsigned int vpx_highbd_avg_8x8_sse2(const uint8_t *s8, int p) {
s0 = _mm_add_epi32(s0, s1);
s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 8));
s0 = _mm_add_epi32(s0, _mm_srli_si128(s0, 4));
- avg = _mm_cvtsi128_si32(s0);
+ avg = (unsigned int)_mm_cvtsi128_si32(s0);
return (avg + 32) >> 6;
}
@@ -275,7 +275,7 @@ static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
hadamard_col8_sse2(src, 0);
hadamard_col8_sse2(src, 1);
diff --git a/libvpx/vpx_dsp/x86/convolve_avx2.h b/libvpx/vpx_dsp/x86/convolve_avx2.h
index 99bc9637f..ebee964b1 100644
--- a/libvpx/vpx_dsp/x86/convolve_avx2.h
+++ b/libvpx/vpx_dsp/x86/convolve_avx2.h
@@ -129,9 +129,8 @@ static INLINE void mm256_storeu2_epi64(__m128i *const dst_ptr_1,
static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1,
__m128i *const dst_ptr_2,
const __m256i *const src) {
- *((uint32_t *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
- *((uint32_t *)(dst_ptr_2)) =
- _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
+ *((int *)(dst_ptr_1)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*src));
+ *((int *)(dst_ptr_2)) = _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1));
}
static INLINE __m256i mm256_round_epi32(const __m256i *const src,
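
This change, like the (unsigned int) cast added in the SSE2 averaging hunk above, follows from _mm_cvtsi128_si32() being declared to return int: storing that result through a uint32_t * converted a signed value implicitly, which sign-conversion warnings flag. Writing through int * keeps the types matched. A minimal sketch with a hypothetical helper name:

    #include <emmintrin.h>

    static unsigned int low_lane_u32(__m128i v) {
      const int low = _mm_cvtsi128_si32(v);  /* the intrinsic returns int */
      return (unsigned int)low;              /* conversion, when needed, is explicit */
    }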
diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
index 3f158b5e4..f3a802029 100644
--- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
+++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h
@@ -89,7 +89,7 @@ void FDCT32x32_2D_AVX2(const int16_t *input, int16_t *output_org, int stride) {
const __m256i k__cospi_m21_p11 = pair256_set_epi16(-cospi_21_64, cospi_11_64);
const __m256i k__cospi_m05_p27 = pair256_set_epi16(-cospi_5_64, cospi_27_64);
const __m256i k__DCT_CONST_ROUNDING = _mm256_set1_epi32(DCT_CONST_ROUNDING);
- const __m256i kZero = _mm256_set1_epi16(0);
+ const __m256i kZero = _mm256_setzero_si256();
const __m256i kOne = _mm256_set1_epi16(1);
// Do the two transform/transpose passes
int pass;
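
Here, and in the SSE2 and loop-filter hunks that follow, zero vectors switch from _mm256_set1_epi16(0) / _mm_set1_epi16(0) to the dedicated zeroing intrinsics. Compilers generally emit the same vpxor/pxor zero idiom either way, so the gain is mainly in stating the intent directly:

    #include <immintrin.h>

    const __m256i zero256 = _mm256_setzero_si256(); /* vpxor ymm, ymm, ymm */
    const __m128i zero128 = _mm_setzero_si128();    /* pxor  xmm, xmm */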
diff --git a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
index ac1246faa..bf350b6da 100644
--- a/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
+++ b/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h
@@ -100,7 +100,7 @@ void FDCT32x32_2D(const int16_t *input, tran_low_t *output_org, int stride) {
const __m128i k__cospi_m21_p11 = pair_set_epi16(-cospi_21_64, cospi_11_64);
const __m128i k__cospi_m05_p27 = pair_set_epi16(-cospi_5_64, cospi_27_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
const __m128i kOne = _mm_set1_epi16(1);
// Do the two transform/transpose passes
diff --git a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
index 78cf9111d..1d07391b0 100644
--- a/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
+++ b/libvpx/vpx_dsp/x86/highbd_inv_txfm_sse2.h
@@ -249,7 +249,7 @@ static INLINE void highbd_idct16_4col_stage7(const __m128i *const in,
static INLINE __m128i add_clamp(const __m128i in0, const __m128i in1,
const int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
// Faster than _mm_set1_epi16((1 << bd) - 1).
const __m128i one = _mm_set1_epi16(1);
const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
diff --git a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
index d265fc1a9..9f45623de 100644
--- a/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_loopfilter_sse2.c
@@ -18,7 +18,7 @@ static INLINE __m128i signed_char_clamp_bd_sse2(__m128i value, int bd) {
__m128i lbounded;
__m128i retval;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
__m128i t80, max, min;
@@ -51,7 +51,7 @@ void vpx_highbd_lpf_horizontal_16_sse2(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
__m128i blimit_v, limit_v, thresh_v;
__m128i q7, p7, q6, p6, q5, p5, q4, p4, q3, p3, q2, p2, q1, p1, q0, p0;
@@ -492,7 +492,7 @@ void vpx_highbd_lpf_horizontal_8_sse2(uint16_t *s, int pitch,
DECLARE_ALIGNED(16, uint16_t, flat_oq2[16]);
DECLARE_ALIGNED(16, uint16_t, flat_oq1[16]);
DECLARE_ALIGNED(16, uint16_t, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit_v, limit_v, thresh_v;
__m128i mask, hev, flat;
__m128i p3 = _mm_load_si128((__m128i *)(s - 4 * pitch));
@@ -720,7 +720,7 @@ void vpx_highbd_lpf_horizontal_4_sse2(uint16_t *s, int pitch,
const uint8_t *blimit,
const uint8_t *limit,
const uint8_t *thresh, int bd) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i blimit_v, limit_v, thresh_v;
__m128i mask, hev, flat;
__m128i p3 = _mm_loadu_si128((__m128i *)(s - 4 * pitch));
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 000000000..8edddd637
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,258 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+static VPX_FORCE_INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i sign = _mm_srai_epi16(*p, 15);
+ const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+ const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static VPX_FORCE_INLINE void update_qp(__m256i *qp) {
+ int i;
+ for (i = 0; i < 5; ++i) {
+ qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+ }
+}
+
+static VPX_FORCE_INLINE void init_qp(const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr,
+ __m256i *qp, int log_scale) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+ const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+ const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ init_one_qp(&zbin, &qp[0]);
+ init_one_qp(&round, &qp[1]);
+ init_one_qp(&quant, &qp[2]);
+ init_one_qp(&dequant, &qp[3]);
+ init_one_qp(&quant_shift, &qp[4]);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+ qp[0] = _mm256_add_epi32(qp[0], rnd);
+ qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+ qp[1] = _mm256_add_epi32(qp[1], rnd);
+ qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// Multiply the 8 int32_t lanes of *x by the corresponding lanes of *y in
+// parallel and shift each product right by 16. The 8 int32_t results are
+// returned.
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+ const __m256i *y) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+ __m256i eobmax,
+ __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i nz_iscan = _mm256_and_si256(iscan, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static VPX_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+#if defined(_MSC_VER) && (_MSC_VER < 1910)
+ return _mm_cvtsi128_si32(_mm256_extracti128_si256(eob, 0)) & 0xffff;
+#else
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+#endif
+}
+
+static VPX_FORCE_INLINE void quantize(const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
+
+void vpx_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const int step = 8;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+static VPX_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static VPX_FORCE_INLINE void quantize_b_32x32(
+ const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (_mm256_movemask_epi8(zbin_mask) == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ {
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i abs_q = mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], 1);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), 1);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+ }
+}
+
+void vpx_highbd_quantize_b_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const unsigned int step = 8;
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ (void)scan;
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
+
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_b_32x32(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
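
Reading aid for the two vector kernels above: a scalar sketch of what one lane of quantize_b_32x32() computes for coefficient index rc (log_scale = 1), assembled from the in-code comments and the scalar SSE2 path in the next hunk; ROUND_POWER_OF_TWO(x, 1) is ((x + 1) >> 1):

    const int coeff = coeff_ptr[rc];
    const int abs_coeff = abs(coeff);
    if (abs_coeff >= ROUND_POWER_OF_TWO(zbin_ptr[rc != 0], 1)) {
      const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
      const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
      const int abs_q = (int)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
      qcoeff_ptr[rc] = coeff < 0 ? -abs_q : abs_q;
      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
    } else {
      qcoeff_ptr[rc] = 0;
      dqcoeff_ptr[rc] = 0;
    }

(The vector code folds the >= zbin test into a > zbin - 1 comparison, which is why init_qp() subtracts 1 from qp[0].)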
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 4535a0f7a..ae1981a83 100644
--- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -25,7 +25,7 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ int i, j, non_zero_regs = (int)count / 4, eob_i = 0;
__m128i zbins[2];
__m128i nzbins[2];
@@ -82,13 +82,14 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
const uint32_t abs_qcoeff =
(uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
- qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
}
}
}
- *eob_ptr = eob_i + 1;
+ *eob_ptr = eob_i;
}
void vpx_highbd_quantize_b_32x32_sse2(
@@ -101,7 +102,7 @@ void vpx_highbd_quantize_b_32x32_sse2(
__m128i nzbins[2];
int idx = 0;
int idx_arr[1024];
- int i, eob = -1;
+ int i, eob = 0;
const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
(void)scan;
@@ -143,10 +144,10 @@ void vpx_highbd_quantize_b_32x32_sse2(
const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
const uint32_t abs_qcoeff =
(uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
- qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
}
- *eob_ptr = eob + 1;
+ *eob_ptr = eob;
}
#endif
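
The casts added in both hunks of this file make the XOR operand explicitly unsigned; the expression itself is the usual branchless sign application. coeff_sign is 0 for a non-negative coefficient and -1 (all bits set) for a negative one, so XOR-then-subtract either leaves the magnitude unchanged or two's-complement negates it. A sketch, with a hypothetical helper name:

    #include <stdint.h>

    /* sign must be 0 or -1; returns x for sign == 0 and -x for sign == -1. */
    static int32_t apply_sign(uint32_t x, int32_t sign) {
      return (int32_t)(x ^ (uint32_t)sign) - sign;
    }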
diff --git a/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c b/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
new file mode 100644
index 000000000..947b5e977
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_sad4d_avx2.c
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h> // AVX2
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
+ uint32_t sad_array[4]) {
+ const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
+ const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
+ const __m256i t2 = _mm256_hadd_epi32(t0, t1);
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t2),
+ _mm256_extractf128_si256(t2, 1));
+ _mm_storeu_si128((__m128i *)sad_array, sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load src and all ref[]
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ int x;
+
+ for (x = 0; x < 4; ++x) {
+ __m256i r[4];
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[x]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[x] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(refs[x] + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[x] + 48));
+
+      // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s0));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s1));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s2));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s3));
+
+ // sum every abs diff
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[0], r[1]));
+ sums_16[x] = _mm256_add_epi16(sums_16[x], _mm256_add_epi16(r[2], r[3]));
+ }
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD64XNX4D(n) \
+ void vpx_highbd_sad64x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *refs[4]; \
+ __m256i sums_16[4]; \
+ __m256i sums_32[4]; \
+ int i; \
+ \
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
+ sums_32[0] = _mm256_setzero_si256(); \
+ sums_32[1] = _mm256_setzero_si256(); \
+ sums_32[2] = _mm256_setzero_si256(); \
+ sums_32[3] = _mm256_setzero_si256(); \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ sums_16[0] = _mm256_setzero_si256(); \
+ sums_16[1] = _mm256_setzero_si256(); \
+ sums_16[2] = _mm256_setzero_si256(); \
+ sums_16[3] = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xHx4d(sums_16, src, src_stride, refs, ref_stride, 2); \
+ \
+      /* sums_16 may overflow after 2 rows, so add the current sums_16 \
+       * into sums_32 */                                               \
+ sums_32[0] = _mm256_add_epi32( \
+ sums_32[0], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[0], 1)))); \
+ sums_32[1] = _mm256_add_epi32( \
+ sums_32[1], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[1], 1)))); \
+ sums_32[2] = _mm256_add_epi32( \
+ sums_32[2], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[2], 1)))); \
+ sums_32[3] = _mm256_add_epi32( \
+ sums_32[3], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[3], 1)))); \
+ \
+ src += src_stride << 1; \
+ } \
+ calc_final_4(sums_32, sad_array); \
+ }
+
+// 64x64
+HIGHBD_SAD64XNX4D(64)
+
+// 64x32
+HIGHBD_SAD64XNX4D(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[8];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 16));
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)(refs[0] + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[3] = _mm256_loadu_si256((const __m256i *)(refs[1] + 16));
+ r[4] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[5] = _mm256_loadu_si256((const __m256i *)(refs[2] + 16));
+ r[6] = _mm256_loadu_si256((const __m256i *)refs[3]);
+ r[7] = _mm256_loadu_si256((const __m256i *)(refs[3] + 16));
+
+    // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s2));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s2));
+ r[4] = _mm256_abs_epi16(_mm256_sub_epi16(r[4], s));
+ r[5] = _mm256_abs_epi16(_mm256_sub_epi16(r[5], s2));
+ r[6] = _mm256_abs_epi16(_mm256_sub_epi16(r[6], s));
+ r[7] = _mm256_abs_epi16(_mm256_sub_epi16(r[7], s2));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], _mm256_add_epi16(r[0], r[1]));
+ sums_16[1] = _mm256_add_epi16(sums_16[1], _mm256_add_epi16(r[2], r[3]));
+ sums_16[2] = _mm256_add_epi16(sums_16[2], _mm256_add_epi16(r[4], r[5]));
+ sums_16[3] = _mm256_add_epi16(sums_16[3], _mm256_add_epi16(r[6], r[7]));
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD32XNX4D(n) \
+ void vpx_highbd_sad32x##n##x4d_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *refs[4]; \
+ __m256i sums_16[4]; \
+ __m256i sums_32[4]; \
+ int i; \
+ \
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]); \
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]); \
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]); \
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]); \
+ sums_32[0] = _mm256_setzero_si256(); \
+ sums_32[1] = _mm256_setzero_si256(); \
+ sums_32[2] = _mm256_setzero_si256(); \
+ sums_32[3] = _mm256_setzero_si256(); \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ sums_16[0] = _mm256_setzero_si256(); \
+ sums_16[1] = _mm256_setzero_si256(); \
+ sums_16[2] = _mm256_setzero_si256(); \
+ sums_16[3] = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xHx4d(sums_16, src, src_stride, refs, ref_stride, 8); \
+ \
+      /* sums_16 may overflow after 8 rows, so add the current sums_16 \
+       * into sums_32 */                                               \
+ sums_32[0] = _mm256_add_epi32( \
+ sums_32[0], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[0], 1)))); \
+ sums_32[1] = _mm256_add_epi32( \
+ sums_32[1], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[1], 1)))); \
+ sums_32[2] = _mm256_add_epi32( \
+ sums_32[2], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[2], 1)))); \
+ sums_32[3] = _mm256_add_epi32( \
+ sums_32[3], \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])), \
+ _mm256_cvtepu16_epi32( \
+ _mm256_extractf128_si256(sums_16[3], 1)))); \
+ \
+ src += src_stride << 3; \
+ } \
+ calc_final_4(sums_32, sad_array); \
+ }
+
+// 32x64
+HIGHBD_SAD32XNX4D(64)
+
+// 32x32
+HIGHBD_SAD32XNX4D(32)
+
+// 32x16
+HIGHBD_SAD32XNX4D(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xHx4d(__m256i *sums_16 /*[4]*/,
+ const uint16_t *src,
+ int src_stride,
+ uint16_t *refs[4],
+ int ref_stride, int height) {
+ int i;
+ for (i = 0; i < height; i++) {
+ __m256i r[4];
+
+ // load src and all ref[]
+ const __m256i s = _mm256_load_si256((const __m256i *)src);
+ r[0] = _mm256_loadu_si256((const __m256i *)refs[0]);
+ r[1] = _mm256_loadu_si256((const __m256i *)refs[1]);
+ r[2] = _mm256_loadu_si256((const __m256i *)refs[2]);
+ r[3] = _mm256_loadu_si256((const __m256i *)refs[3]);
+
+    // absolute differences between each ref[] and src
+ r[0] = _mm256_abs_epi16(_mm256_sub_epi16(r[0], s));
+ r[1] = _mm256_abs_epi16(_mm256_sub_epi16(r[1], s));
+ r[2] = _mm256_abs_epi16(_mm256_sub_epi16(r[2], s));
+ r[3] = _mm256_abs_epi16(_mm256_sub_epi16(r[3], s));
+
+ // sum every abs diff
+ sums_16[0] = _mm256_add_epi16(sums_16[0], r[0]);
+ sums_16[1] = _mm256_add_epi16(sums_16[1], r[1]);
+ sums_16[2] = _mm256_add_epi16(sums_16[2], r[2]);
+ sums_16[3] = _mm256_add_epi16(sums_16[3], r[3]);
+
+ src += src_stride;
+ refs[0] += ref_stride;
+ refs[1] += ref_stride;
+ refs[2] += ref_stride;
+ refs[3] += ref_stride;
+ }
+}
+
+void vpx_highbd_sad16x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+ __m256i sums_32[4];
+ int i;
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_32[0] = _mm256_setzero_si256();
+ sums_32[1] = _mm256_setzero_si256();
+ sums_32[2] = _mm256_setzero_si256();
+ sums_32[3] = _mm256_setzero_si256();
+
+ for (i = 0; i < 2; ++i) {
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+    // sums_16 may overflow after 16 rows, so add the current sums_16 to sums_32
+ sums_32[0] = _mm256_add_epi32(
+ sums_32[0],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1))));
+ sums_32[1] = _mm256_add_epi32(
+ sums_32[1],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1))));
+ sums_32[2] = _mm256_add_epi32(
+ sums_32[2],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1))));
+ sums_32[3] = _mm256_add_epi32(
+ sums_32[3],
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1))));
+
+ src += src_stride << 4;
+ }
+ calc_final_4(sums_32, sad_array);
+}
+
+void vpx_highbd_sad16x16x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 16);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
+
+void vpx_highbd_sad16x8x4d_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *const ref_array[4],
+ int ref_stride, uint32_t sad_array[4]) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *refs[4];
+ __m256i sums_16[4];
+
+ refs[0] = CONVERT_TO_SHORTPTR(ref_array[0]);
+ refs[1] = CONVERT_TO_SHORTPTR(ref_array[1]);
+ refs[2] = CONVERT_TO_SHORTPTR(ref_array[2]);
+ refs[3] = CONVERT_TO_SHORTPTR(ref_array[3]);
+ sums_16[0] = _mm256_setzero_si256();
+ sums_16[1] = _mm256_setzero_si256();
+ sums_16[2] = _mm256_setzero_si256();
+ sums_16[3] = _mm256_setzero_si256();
+
+ highbd_sad16xHx4d(sums_16, src, src_stride, refs, ref_stride, 8);
+
+ {
+ __m256i sums_32[4];
+ sums_32[0] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[0])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[0], 1)));
+ sums_32[1] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[1])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[1], 1)));
+ sums_32[2] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[2])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[2], 1)));
+ sums_32[3] = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16[3])),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16[3], 1)));
+ calc_final_4(sums_32, sad_array);
+ }
+}
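
Note on the arithmetic used throughout these SAD kernels: with 12-bit input,
each absolute difference is at most 4095, so a 16-bit lane can accumulate at
most 16 of them (16 * 4095 = 65520 <= 65535) before the partial sums must be
widened to 32 bits. The per-width flush cadence (every 16 rows for 16-wide,
8 for 32-wide, 2 for 64-wide) keeps every lane at or under that bound. Below
is a minimal scalar sketch of the pattern; highbd_sad_model and its lane
mapping are illustrative only, not part of the patch.

#include <stdint.h>

static uint32_t highbd_sad_model(const uint16_t *src, int src_stride,
                                 const uint16_t *ref, int ref_stride,
                                 int width, int height) {
  uint16_t lane[16] = { 0 };  /* models sums_16 */
  uint32_t sum32 = 0;         /* models sums_32 */
  /* Each row adds width / 16 differences per lane, so flushing every
   * (16 * 16) / width rows keeps each lane at or below 16 differences. */
  const int rows_per_flush = (16 * 16) / width;
  int r, c, l;
  for (r = 0; r < height; ++r) {
    for (c = 0; c < width; ++c) {
      const int d = (int)src[c] - (int)ref[c];
      lane[c % 16] = (uint16_t)(lane[c % 16] + (d < 0 ? -d : d));
    }
    src += src_stride;
    ref += ref_stride;
    if ((r + 1) % rows_per_flush == 0 || r + 1 == height) {
      for (l = 0; l < 16; ++l) {
        sum32 += lane[l];  /* models _mm256_cvtepu16_epi32 + add_epi32 */
        lane[l] = 0;
      }
    }
  }
  return sum32;
}

For example, highbd_sad_model(src, sstride, ref, rstride, 16, 32) mirrors
what vpx_highbd_sad16x32_avx2() in the next file computes.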
diff --git a/libvpx/vpx_dsp/x86/highbd_sad_avx2.c b/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 000000000..231b67f80
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,468 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <immintrin.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE unsigned int calc_final(const __m256i sums_32) {
+ const __m256i t0 = _mm256_add_epi32(sums_32, _mm256_srli_si256(sums_32, 8));
+ const __m256i t1 = _mm256_add_epi32(t0, _mm256_srli_si256(t0, 4));
+ const __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(t1),
+ _mm256_extractf128_si256(t1, 1));
+ return (unsigned int)_mm_cvtsi128_si32(sum);
+}
+
+static VPX_FORCE_INLINE void highbd_sad64xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load one row of src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(r2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(r3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD64XN(n) \
+ unsigned int vpx_highbd_sad64x##n##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xH(&sums_16, src, src_stride, ref, ref_stride, 2); \
+ \
+ /* sums_16 will overflow after 2 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 1; \
+ ref += ref_stride << 1; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 64x64
+HIGHBD_SAD64XN(64)
+
+// 64x32
+HIGHBD_SAD64XN(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load one row of src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+#define HIGHBD_SAD32XN(n) \
+ unsigned int vpx_highbd_sad32x##n##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xH(&sums_16, src, src_stride, ref, ref_stride, 8); \
+ \
+ /* sums_16 will overflow after 8 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 3; \
+ ref += ref_stride << 3; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 32x64
+HIGHBD_SAD32XN(64)
+
+// 32x32
+HIGHBD_SAD32XN(32)
+
+// 32x16
+HIGHBD_SAD32XN(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH(__m256i *sums_16,
+ const uint16_t *src, int src_stride,
+ uint16_t *ref, int ref_stride,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+ // load two rows of src and ref
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ // absolute differences between ref and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(r0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(r1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ }
+}
+
+unsigned int vpx_highbd_sad16x32_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+ // sums_16 will overflow after 16 rows, so add current sums_16 to sums_32
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ }
+ return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH(&sums_16, src, src_stride, ref, ref_stride, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+// AVG -------------------------------------------------------------------------
+static VPX_FORCE_INLINE void highbd_sad64xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load one row of src, ref, and second pred
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i s2 = _mm256_load_si256((const __m256i *)(src + 32));
+ const __m256i s3 = _mm256_load_si256((const __m256i *)(src + 48));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i r2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i r3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i x2 = _mm256_loadu_si256((const __m256i *)(sec + 32));
+ const __m256i x3 = _mm256_loadu_si256((const __m256i *)(sec + 48));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ const __m256i avg2 = _mm256_avg_epu16(r2, x2);
+ const __m256i avg3 = _mm256_avg_epu16(r3, x3);
+ // absolute differences between the ref/pred average and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ const __m256i abs_diff2 = _mm256_abs_epi16(_mm256_sub_epi16(avg2, s2));
+ const __m256i abs_diff3 = _mm256_abs_epi16(_mm256_sub_epi16(avg3, s3));
+ // sum every abs diff
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff0, abs_diff1));
+ *sums_16 =
+ _mm256_add_epi16(*sums_16, _mm256_add_epi16(abs_diff2, abs_diff3));
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 64;
+ }
+}
+
+#define HIGHBD_SAD64XN_AVG(n) \
+ unsigned int vpx_highbd_sad64x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 2); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad64xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 2); \
+ \
+ /* sums_16 will overflow after 2 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 1; \
+ ref += ref_stride << 1; \
+ sec += 64 << 1; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 64x64
+HIGHBD_SAD64XN_AVG(64)
+
+// 64x32
+HIGHBD_SAD64XN_AVG(32)
+
+static VPX_FORCE_INLINE void highbd_sad32xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ // load one row of src, ref, and second pred
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + 16));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ // absolute differences between the ref/pred average and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride;
+ ref += ref_stride;
+ sec += 32;
+ }
+}
+
+#define HIGHBD_SAD32XN_AVG(n) \
+ unsigned int vpx_highbd_sad32x##n##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred); \
+ __m256i sums_32 = _mm256_setzero_si256(); \
+ int i; \
+ \
+ for (i = 0; i < (n / 8); ++i) { \
+ __m256i sums_16 = _mm256_setzero_si256(); \
+ \
+ highbd_sad32xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8); \
+ \
+ /* sums_16 will overflow after 8 rows, so add current sums_16 to \
+ * sums_32*/ \
+ sums_32 = _mm256_add_epi32( \
+ sums_32, \
+ _mm256_add_epi32( \
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)), \
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)))); \
+ \
+ src += src_stride << 3; \
+ ref += ref_stride << 3; \
+ sec += 32 << 3; \
+ } \
+ return calc_final(sums_32); \
+ }
+
+// 32x64
+HIGHBD_SAD32XN_AVG(64)
+
+// 32x32
+HIGHBD_SAD32XN_AVG(32)
+
+// 32x16
+HIGHBD_SAD32XN_AVG(16)
+
+static VPX_FORCE_INLINE void highbd_sad16xH_avg(__m256i *sums_16,
+ const uint16_t *src,
+ int src_stride, uint16_t *ref,
+ int ref_stride, uint16_t *sec,
+ int height) {
+ int i;
+ for (i = 0; i < height; i += 2) {
+ // load two rows of src, ref, and second pred
+ const __m256i s0 = _mm256_load_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_load_si256((const __m256i *)(src + src_stride));
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)sec);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(sec + 16));
+ const __m256i avg0 = _mm256_avg_epu16(r0, x0);
+ const __m256i avg1 = _mm256_avg_epu16(r1, x1);
+ // absolute differences between the ref/pred average and src
+ const __m256i abs_diff0 = _mm256_abs_epi16(_mm256_sub_epi16(avg0, s0));
+ const __m256i abs_diff1 = _mm256_abs_epi16(_mm256_sub_epi16(avg1, s1));
+ // sum every abs diff
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff0);
+ *sums_16 = _mm256_add_epi16(*sums_16, abs_diff1);
+
+ src += src_stride << 1;
+ ref += ref_stride << 1;
+ sec += 32;
+ }
+}
+
+unsigned int vpx_highbd_sad16x32_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_32 = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < 2; ++i) {
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+ // sums_16 will overflow after 16 rows, so add current sums_16 to sums_32
+ sums_32 = _mm256_add_epi32(
+ sums_32,
+ _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1))));
+
+ src += src_stride << 4;
+ ref += ref_stride << 4;
+ sec += 16 << 4;
+ }
+ return calc_final(sums_32);
+}
+
+unsigned int vpx_highbd_sad16x16_avg_avx2(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 16);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
+
+unsigned int vpx_highbd_sad16x8_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint16_t *sec = CONVERT_TO_SHORTPTR(second_pred);
+ __m256i sums_16 = _mm256_setzero_si256();
+
+ highbd_sad16xH_avg(&sums_16, src, src_stride, ref, ref_stride, sec, 8);
+
+ {
+ const __m256i sums_32 = _mm256_add_epi32(
+ _mm256_cvtepu16_epi32(_mm256_castsi256_si128(sums_16)),
+ _mm256_cvtepu16_epi32(_mm256_extractf128_si256(sums_16, 1)));
+ return calc_final(sums_32);
+ }
+}
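
For reference, calc_final() above reduces eight 32-bit lane sums to a single
value: the two _mm256_srli_si256 steps fold each 128-bit half onto its lane 0,
and the final _mm_add_epi32 adds the two halves, so the result is simply the
sum of all eight lanes. A scalar equivalent, with an illustrative name:

/* Scalar equivalent of calc_final(): sum the eight 32-bit lanes of sums_32. */
static unsigned int calc_final_model(const uint32_t lane[8]) {
  unsigned int sum = 0;
  int i;
  for (i = 0; i < 8; ++i) sum += lane[i];
  return sum;
}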
diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
index 7c8d79b09..381e0ad19 100644
--- a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <emmintrin.h> // SSE2
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
@@ -559,3 +560,49 @@ FNS(sse2)
#undef FNS
#undef FN
+
+void vpx_highbd_comp_avg_pred_sse2(uint16_t *comp_pred, const uint16_t *pred,
+ int width, int height, const uint16_t *ref,
+ int ref_stride) {
+ int i, j;
+ if (width > 8) {
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; j += 16) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[j]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[j + 8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[j]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[j + 8]);
+ _mm_storeu_si128((__m128i *)&comp_pred[j], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[j + 8], _mm_avg_epu16(p1, r1));
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+ } else if (width == 8) {
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadu_si128((const __m128i *)&pred[8]);
+ const __m128i r0 = _mm_loadu_si128((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadu_si128((const __m128i *)&ref[ref_stride]);
+ _mm_storeu_si128((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storeu_si128((__m128i *)&comp_pred[8], _mm_avg_epu16(p1, r1));
+ comp_pred += 8 << 1;
+ pred += 8 << 1;
+ ref += ref_stride << 1;
+ }
+ } else {
+ assert(width == 4);
+ for (i = 0; i < height; i += 2) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)&pred[0]);
+ const __m128i p1 = _mm_loadl_epi64((const __m128i *)&pred[4]);
+ const __m128i r0 = _mm_loadl_epi64((const __m128i *)&ref[0]);
+ const __m128i r1 = _mm_loadl_epi64((const __m128i *)&ref[ref_stride]);
+ _mm_storel_epi64((__m128i *)&comp_pred[0], _mm_avg_epu16(p0, r0));
+ _mm_storel_epi64((__m128i *)&comp_pred[4], _mm_avg_epu16(p1, r1));
+ comp_pred += 4 << 1;
+ pred += 4 << 1;
+ ref += ref_stride << 1;
+ }
+ }
+}
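
_mm_avg_epu16() computes (a + b + 1) >> 1 in each lane, so the function added
above is an exact vector form of rounded averaging. A scalar model of the same
arithmetic follows; it is a hedged sketch with an illustrative name, not the
library's reference implementation:

#include <stdint.h>

static void highbd_comp_avg_pred_model(uint16_t *comp_pred,
                                       const uint16_t *pred, int width,
                                       int height, const uint16_t *ref,
                                       int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      /* rounded average, identical to _mm_avg_epu16 per lane */
      comp_pred[j] = (uint16_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;  /* comp_pred and pred are contiguous, width-strided */
    pred += width;
    ref += ref_stride;
  }
}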
diff --git a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index 4b02da966..f42b3df84 100644
--- a/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -243,7 +243,7 @@ void iadst8_sse2(__m128i *const in) {
const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
__m128i s[8], u[16], v[8], w[16];
// transpose
@@ -546,7 +546,7 @@ void vpx_iadst16_8col_sse2(__m128i *const in) {
const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i kZero = _mm_set1_epi16(0);
+ const __m128i kZero = _mm_setzero_si128();
u[0] = _mm_unpacklo_epi16(in[15], in[0]);
u[1] = _mm_unpackhi_epi16(in[15], in[0]);
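
The kZero change here (repeated in the loop-filter files below) replaces
_mm_set1_epi16(0) with _mm_setzero_si128(). Both yield an all-zero register
and both typically compile to the same self-xor (pxor); setzero simply states
the intent directly. A minimal illustration:

#include <emmintrin.h>

/* Equivalent zero registers; the second spelling is the idiomatic one. */
static __m128i zero_via_set1(void) { return _mm_set1_epi16(0); }
static __m128i zero_via_setzero(void) { return _mm_setzero_si128(); }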
diff --git a/libvpx/vpx_dsp/x86/loopfilter_avx2.c b/libvpx/vpx_dsp/x86/loopfilter_avx2.c
index be391992a..a58fb6553 100644
--- a/libvpx/vpx_dsp/x86/loopfilter_avx2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_avx2.c
@@ -18,7 +18,7 @@ void vpx_lpf_horizontal_16_avx2(unsigned char *s, int pitch,
const unsigned char *limit,
const unsigned char *thresh) {
__m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
__m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
__m128i abs_p1p0;
@@ -372,7 +372,7 @@ void vpx_lpf_horizontal_16_dual_avx2(unsigned char *s, int pitch,
const unsigned char *limit,
const unsigned char *thresh) {
__m128i mask, hev, flat, flat2;
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
diff --git a/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
index 347c9fdbe..6ea34cdd1 100644
--- a/libvpx/vpx_dsp/x86/loopfilter_sse2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -106,7 +106,7 @@ static INLINE __m128i abs_diff(__m128i a, __m128i b) {
void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i limit_v =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
_mm_loadl_epi64((const __m128i *)limit));
@@ -140,7 +140,7 @@ void vpx_lpf_horizontal_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
void vpx_lpf_vertical_4_sse2(uint8_t *s, int pitch, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i limit_v =
_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)blimit),
_mm_loadl_epi64((const __m128i *)limit));
@@ -232,7 +232,7 @@ void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
@@ -594,7 +594,7 @@ void vpx_lpf_horizontal_16_dual_sse2(unsigned char *s, int pitch,
const unsigned char *blimit,
const unsigned char *limit,
const unsigned char *thresh) {
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi8(1);
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
@@ -932,7 +932,7 @@ void vpx_lpf_horizontal_8_sse2(unsigned char *s, int pitch,
DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit_v = _mm_load_si128((const __m128i *)blimit);
const __m128i limit_v = _mm_load_si128((const __m128i *)limit);
const __m128i thresh_v = _mm_load_si128((const __m128i *)thresh);
@@ -1152,7 +1152,7 @@ void vpx_lpf_horizontal_8_dual_sse2(
DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
const __m128i blimit =
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)blimit0),
_mm_load_si128((const __m128i *)blimit1));
@@ -1406,7 +1406,7 @@ void vpx_lpf_horizontal_4_dual_sse2(unsigned char *s, int pitch,
const __m128i thresh =
_mm_unpacklo_epi64(_mm_load_si128((const __m128i *)thresh0),
_mm_load_si128((const __m128i *)thresh1));
- const __m128i zero = _mm_set1_epi16(0);
+ const __m128i zero = _mm_setzero_si128();
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
__m128i mask, hev, flat;
diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h
index 8b6d4d1dd..031f361a4 100644
--- a/libvpx/vpx_dsp/x86/mem_sse2.h
+++ b/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -27,13 +27,13 @@ static INLINE int32_t loadu_int32(const void *src) {
}
static INLINE __m128i load_unaligned_u32(const void *a) {
- uint32_t val;
+ int val;
memcpy(&val, a, sizeof(val));
return _mm_cvtsi32_si128(val);
}
static INLINE void store_unaligned_u32(void *const a, const __m128i v) {
- const uint32_t val = _mm_cvtsi128_si32(v);
+ const int val = _mm_cvtsi128_si32(v);
memcpy(a, &val, sizeof(val));
}
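
The switch from uint32_t to int in load_unaligned_u32()/store_unaligned_u32()
matches the declared types of _mm_cvtsi32_si128(int) and _mm_cvtsi128_si32(),
avoiding an implicit sign conversion; the memcpy still keeps the unaligned
access well defined, and the stored bit pattern is unchanged. A hypothetical
caller (illustrative name) is unaffected:

/* Copy four bytes through an XMM register; the signedness of the scalar
 * round-trip does not alter the bits. */
static void copy_4bytes(void *dst, const void *src) {
  store_unaligned_u32(dst, load_unaligned_u32(src));
}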
diff --git a/libvpx/vpx_dsp/x86/post_proc_sse2.c b/libvpx/vpx_dsp/x86/post_proc_sse2.c
index d1029afc4..119fa7cd1 100644
--- a/libvpx/vpx_dsp/x86/post_proc_sse2.c
+++ b/libvpx/vpx_dsp/x86/post_proc_sse2.c
@@ -36,7 +36,7 @@ void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
__m128i s = _mm_loadl_epi64((__m128i *)dst);
__m128i sum, sumsq_0, sumsq_1;
__m128i tmp_0, tmp_1;
- __m128i below_context;
+ __m128i below_context = _mm_setzero_si128();
s = _mm_unpacklo_epi8(s, zero);
diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c
index 706e4e641..7d8352721 100644
--- a/libvpx/vpx_dsp/x86/quantize_avx.c
+++ b/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -93,8 +93,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -134,8 +133,7 @@ void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
@@ -229,8 +227,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -272,8 +269,7 @@ void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
diff --git a/libvpx/vpx_dsp/x86/quantize_avx2.c b/libvpx/vpx_dsp/x86/quantize_avx2.c
new file mode 100644
index 000000000..28f7c9c7d
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/quantize_avx2.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void load_b_values_avx2(
+ const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+ __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ __m256i *shift, int log_scale) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *zbin = _mm256_add_epi16(*zbin, rnd);
+ *zbin = _mm256_srai_epi16(*zbin, log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask (see quantize_b_16 and quantize_b_32x32_16).
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *round = _mm256_add_epi16(*round, rnd);
+ *round = _mm256_srai_epi16(*round, log_scale);
+ }
+
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static VPX_FORCE_INLINE __m256i
+load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+#else
+ // typedef int16_t tran_low_t;
+ return _mm256_loadu_si256((const __m256i *)coeff_ptr);
+#endif
+}
+
+static VPX_FORCE_INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // typedef int32_t tran_low_t;
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_storeu_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+#else
+ // typedef int16_t tran_low_t;
+ _mm256_storeu_si256((__m256i *)coeff_ptr, coeff_vals);
+#endif
+}
+
+static VPX_FORCE_INLINE __m256i
+quantize_b_16(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_storeu_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_storeu_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ return _mm256_setzero_si256();
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i low = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_qcoeff, *v_dequant);
+
+ const __m256i v_dqcoeff_lo = _mm256_unpacklo_epi16(low, high);
+ const __m256i v_dqcoeff_hi = _mm256_unpackhi_epi16(low, high);
+#else
+ const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+#endif
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+#endif
+ return v_nz_mask;
+ }
+}
+
+static VPX_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan,
+ __m256i v_eobmax,
+ __m256i v_mask) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const __m256i v_iscan = _mm256_permute4x64_epi64(
+ _mm256_loadu_si256((const __m256i *)iscan), 0xD8);
+#else
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+#endif
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static VPX_FORCE_INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+void vpx_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift, v_nz_mask;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ (void)scan;
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 0);
+ // Do DC and first 15 AC.
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask = quantize_b_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static VPX_FORCE_INLINE __m256i quantize_b_32x32_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *iscan, __m256i *v_quant,
+ __m256i *v_dequant, __m256i *v_round, __m256i *v_zbin,
+ __m256i *v_quant_shift, __m256i *v_eobmax) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+#endif
+ return *v_eobmax;
+ }
+ {
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >> 15);
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32_hi =
+ _mm256_slli_epi16(_mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), 1);
+ const __m256i v_tmp32_lo =
+ _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 15);
+ const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_sign_lo =
+ _mm256_unpacklo_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i v_sign_hi =
+ _mm256_unpackhi_epi16(_mm256_setzero_si256(), v_coeff);
+ const __m256i low = _mm256_mullo_epi16(v_tmp32, *v_dequant);
+ const __m256i high = _mm256_mulhi_epi16(v_tmp32, *v_dequant);
+ const __m256i v_dqcoeff_lo = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpacklo_epi16(low, high), 1), v_sign_lo);
+ const __m256i v_dqcoeff_hi = _mm256_sign_epi32(
+ _mm256_srli_epi32(_mm256_unpackhi_epi16(low, high), 1), v_sign_hi);
+ const __m256i v_nz_mask =
+ _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr), v_dqcoeff_lo);
+ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + 8), v_dqcoeff_hi);
+#else
+ store_coefficients_avx2(_mm256_packs_epi32(v_dqcoeff_lo, v_dqcoeff_hi),
+ dqcoeff_ptr);
+#endif
+
+ return get_max_lane_eob(iscan, *v_eobmax, v_nz_mask);
+ }
+}
+
+void vpx_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+ intptr_t count;
+ (void)n_coeffs;
+ (void)scan;
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 1);
+
+ // Do DC and first 15 AC.
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (count = (32 * 32) - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_eobmax = quantize_b_32x32_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, iscan,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, &v_eobmax);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
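
The mulhi chains above encode the standard libvpx quantizer arithmetic:
_mm256_mulhi_epi16(x, q) returns (x * q) >> 16 per lane. Below is a hedged
scalar model of one coefficient on the quantize_b_16() path, assuming the
16-bit (non-high-bitdepth) case; quantize_b_model is an illustrative name,
and the authoritative scalar version is vpx_quantize_b_c():

#include <stdint.h>
#include <stdlib.h>

static void quantize_b_model(int16_t coeff, int16_t zbin, int16_t round,
                             int16_t quant, int16_t quant_shift,
                             int16_t dequant, int16_t *qcoeff,
                             int16_t *dqcoeff) {
  const int abs_coeff = abs(coeff);
  /* The vector code pre-subtracts 1 from zbin so one cmpgt implements the
   * abs_coeff >= zbin test. */
  if (abs_coeff < zbin) {
    *qcoeff = 0;
    *dqcoeff = 0;
  } else {
    const int tmp = abs_coeff + round;  /* _mm256_adds_epi16 (saturating) */
    /* mulhi, add, mulhi: */
    const int tmp32 = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
    *qcoeff = (int16_t)(coeff < 0 ? -tmp32 : tmp32);  /* _mm256_sign_epi16 */
    *dqcoeff = (int16_t)(*qcoeff * dequant);  /* mullo/mulhi 16x16 -> 32 */
  }
}

The 32x32 variant differs only in the final shift (>> 15, built from the
slli/srli/or trick above) and in halving the dequantized value (the
_mm256_srli_epi32(..., 1) before re-applying the sign).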
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c
index 459d95f28..9533e7916 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -76,7 +76,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
@@ -106,8 +106,7 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
index += 16;
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.h b/libvpx/vpx_dsp/x86/quantize_sse2.h
index afe2f924b..27bfb4e41 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.h
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.h
@@ -29,6 +29,15 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
*shift = _mm_load_si128((const __m128i *)shift_ptr);
}
+static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr,
+ __m128i *dequant) {
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+}
+
// With ssse3 and later abs() and sign() are preferred.
static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
a = _mm_xor_si128(a, sign);
@@ -62,11 +71,8 @@ static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
#endif // CONFIG_VP9_HIGHBITDEPTH
}
-// Scan 16 values for eob reference in scan. Use masks (-1) from comparing to
-// zbin to add 1 to the index in 'scan'.
+// Scan 16 values for eob reference in scan.
static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
- const __m128i zbin_mask0,
- const __m128i zbin_mask1,
const int16_t *scan, const int index,
const __m128i zero) {
const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
@@ -74,9 +80,6 @@ static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
__m128i scan0 = _mm_load_si128((const __m128i *)(scan + index));
__m128i scan1 = _mm_load_si128((const __m128i *)(scan + index + 8));
__m128i eob0, eob1;
- // Add one to convert from indices to counts
- scan0 = _mm_sub_epi16(scan0, zbin_mask0);
- scan1 = _mm_sub_epi16(scan1, zbin_mask1);
eob0 = _mm_andnot_si128(zero_coeff0, scan0);
eob1 = _mm_andnot_si128(zero_coeff1, scan1);
return _mm_max_epi16(eob0, eob1);
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c
index 9d2a88b7b..476230286 100644
--- a/libvpx/vpx_dsp/x86/quantize_ssse3.c
+++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -70,7 +70,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
- eob = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
// AC only loop.
while (index < n_coeffs) {
@@ -98,8 +98,7 @@ void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
index += 16;
@@ -202,8 +201,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan, 0, zero);
}
// AC only loop.
@@ -249,8 +247,7 @@ void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, zero,
dqcoeff_ptr + 8 + index);
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, iscan, index, zero);
eob = _mm_max_epi16(eob, eob0);
}
diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c
index 3b48acd51..29bedb0e6 100644
--- a/libvpx/vpx_dsp/x86/sad_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -14,7 +14,7 @@
#define FSAD64_H(h) \
unsigned int vpx_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -35,8 +35,7 @@
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSAD32_H(h) \
@@ -92,7 +91,7 @@ FSAD32
unsigned int vpx_sad64x##h##_avg_avx2( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -118,15 +117,14 @@ FSAD32
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSADAVG32_H(h) \
unsigned int vpx_sad32x##h##_avg_avx2( \
const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
int ref_stride, const uint8_t *second_pred) { \
- int i, res; \
+ int i; \
__m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
__m256i sum_sad = _mm256_setzero_si256(); \
__m256i sum_sad_h; \
@@ -156,8 +154,7 @@ FSAD32
sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
- res = _mm_cvtsi128_si32(sum_sad128); \
- return res; \
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
}
#define FSADAVG64 \
diff --git a/libvpx/vpx_dsp/x86/subtract_avx2.c b/libvpx/vpx_dsp/x86/subtract_avx2.c
new file mode 100644
index 000000000..4849581ed
--- /dev/null
+++ b/libvpx/vpx_dsp/x86/subtract_avx2.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+static VPX_FORCE_INLINE void subtract32_avx2(int16_t *diff_ptr,
+ const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ const __m256i s = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i p = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s));
+ const __m256i s_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s, 1));
+ const __m256i p_0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(p));
+ const __m256i p_1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(p, 1));
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ const __m256i d_1 = _mm256_sub_epi16(s_1, p_1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d_1);
+}
+
+static VPX_FORCE_INLINE void subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ const __m128i s = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i p = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ const __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static VPX_FORCE_INLINE void subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ int j;
+ for (j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void vpx_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 32:
+ subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 64:
+ subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ default:
+ vpx_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride,
+ const uint8_t *src8_ptr,
+ ptrdiff_t src_stride,
+ const uint8_t *pred8_ptr,
+ ptrdiff_t pred_stride, int bd) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8_ptr);
+ uint16_t *pred_ptr = CONVERT_TO_SHORTPTR(pred8_ptr);
+ (void)bd;
+ if (cols == 64) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i s2 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 32));
+ const __m256i s3 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 48));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i p2 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 32));
+ const __m256i p3 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 48));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ const __m256i d2 = _mm256_sub_epi16(s2, p2);
+ const __m256i d3 = _mm256_sub_epi16(s3, p3);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 32), d2);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 48), d3);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 32) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 = _mm256_lddqu_si256((const __m256i *)(src_ptr + 16));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 = _mm256_lddqu_si256((const __m256i *)(pred_ptr + 16));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + 16), d1);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ } while (--j != 0);
+ } else if (cols == 16) {
+ int j = rows;
+ do {
+ const __m256i s0 = _mm256_lddqu_si256((const __m256i *)src_ptr);
+ const __m256i s1 =
+ _mm256_lddqu_si256((const __m256i *)(src_ptr + src_stride));
+ const __m256i p0 = _mm256_lddqu_si256((const __m256i *)pred_ptr);
+ const __m256i p1 =
+ _mm256_lddqu_si256((const __m256i *)(pred_ptr + pred_stride));
+ const __m256i d0 = _mm256_sub_epi16(s0, p0);
+ const __m256i d1 = _mm256_sub_epi16(s1, p1);
+ _mm256_storeu_si256((__m256i *)diff_ptr, d0);
+ _mm256_storeu_si256((__m256i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else if (cols == 8) {
+ int j = rows;
+ do {
+ const __m128i s0 = _mm_lddqu_si128((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_lddqu_si128((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_lddqu_si128((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_lddqu_si128((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storeu_si128((__m128i *)diff_ptr, d0);
+ _mm_storeu_si128((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ } else {
+ int j = rows;
+ assert(cols == 4);
+ do {
+ const __m128i s0 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ const __m128i s1 =
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)pred_ptr);
+ const __m128i p1 =
+ _mm_loadl_epi64((const __m128i *)(pred_ptr + pred_stride));
+ const __m128i d0 = _mm_sub_epi16(s0, p0);
+ const __m128i d1 = _mm_sub_epi16(s1, p1);
+ _mm_storel_epi64((__m128i *)diff_ptr, d0);
+ _mm_storel_epi64((__m128i *)(diff_ptr + diff_stride), d1);
+ src_ptr += src_stride << 1;
+ pred_ptr += pred_stride << 1;
+ diff_ptr += diff_stride << 1;
+ j -= 2;
+ } while (j != 0);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
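
Both subtract paths compute the same element-wise difference; only the
register widths per row differ. A scalar model of the 8-bit entry point
(illustrative name; the high-bitdepth variant is identical with uint16_t
sources):

#include <stddef.h>
#include <stdint.h>

static void subtract_block_model(int rows, int cols, int16_t *diff,
                                 ptrdiff_t diff_stride, const uint8_t *src,
                                 ptrdiff_t src_stride, const uint8_t *pred,
                                 ptrdiff_t pred_stride) {
  int r, c;
  for (r = 0; r < rows; ++r) {
    for (c = 0; c < cols; ++c)
      diff[c] = (int16_t)(src[c] - pred[c]);  /* widen, then subtract */
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}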
diff --git a/libvpx/vpx_dsp/x86/sum_squares_sse2.c b/libvpx/vpx_dsp/x86/sum_squares_sse2.c
index 14f3b35c0..df6514b2c 100644
--- a/libvpx/vpx_dsp/x86/sum_squares_sse2.c
+++ b/libvpx/vpx_dsp/x86/sum_squares_sse2.c
@@ -33,7 +33,7 @@ uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
} else {
// Generic case
int r = size;
- const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, -1, 0, -1);
__m128i v_acc_q = _mm_setzero_si128();
assert(size % 8 == 0);
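
_mm_set_epi32() takes int arguments, so writing the all-ones lanes as -1
rather than 0xffffffff sidesteps an implicit unsigned-to-int conversion while
producing the identical bit pattern. The resulting mask is
0x00000000ffffffff in each 64-bit lane, i.e. ANDing with it zero-extends the
low 32 bits of each lane. Per 64-bit lane it behaves like this one-line model:

static uint64_t zext_low32(uint64_t lane) { return lane & 0xffffffffULL; }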
diff --git a/libvpx/vpx_dsp/x86/variance_avx2.c b/libvpx/vpx_dsp/x86/variance_avx2.c
index 9232acbfb..35925d590 100644
--- a/libvpx/vpx_dsp/x86/variance_avx2.c
+++ b/libvpx/vpx_dsp/x86/variance_avx2.c
@@ -590,17 +590,20 @@ static INLINE int sub_pix_var32xh(const uint8_t *src, int src_stride,
return sum;
}
-static unsigned int sub_pixel_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, int height, unsigned int *sse) {
+static int sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ int height, unsigned int *sse) {
return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
NULL, 0, 0, height, sse);
}
-static unsigned int sub_pixel_avg_variance32xh_avx2(
- const uint8_t *src, int src_stride, int x_offset, int y_offset,
- const uint8_t *dst, int dst_stride, const uint8_t *second_pred,
- int second_stride, int height, unsigned int *sse) {
+static int sub_pixel_avg_variance32xh_avx2(const uint8_t *src, int src_stride,
+ int x_offset, int y_offset,
+ const uint8_t *dst, int dst_stride,
+ const uint8_t *second_pred,
+ int second_stride, int height,
+ unsigned int *sse) {
return sub_pix_var32xh(src, src_stride, x_offset, y_offset, dst, dst_stride,
second_pred, second_stride, 1, height, sse);
}
diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c
index a67c92aad..d6eb12da1 100644
--- a/libvpx/vpx_dsp/x86/variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -19,7 +19,7 @@
static INLINE unsigned int add32x4_sse2(__m128i val) {
val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
- return _mm_cvtsi128_si32(val);
+ return (unsigned int)_mm_cvtsi128_si32(val);
}
unsigned int vpx_get_mb_ss_sse2(const int16_t *src_ptr) {
@@ -85,7 +85,7 @@ static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
vsum = _mm_unpacklo_epi16(vsum, vsum);
vsum = _mm_srai_epi32(vsum, 16);
- *sum = add32x4_sse2(vsum);
+ *sum = (int)add32x4_sse2(vsum);
}
static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
@@ -97,7 +97,7 @@ static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
// Can handle 1024 pixels' diff sum (such as 32x32)
static INLINE int sum_final_sse2(const __m128i sum) {
const __m128i t = sum_to_32bit_sse2(sum);
- return add32x4_sse2(t);
+ return (int)add32x4_sse2(t);
}
static INLINE void variance4_sse2(const uint8_t *src_ptr, const int src_stride,
@@ -349,7 +349,7 @@ unsigned int vpx_variance32x64_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
@@ -369,7 +369,7 @@ unsigned int vpx_variance64x32_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 11);
}
@@ -389,7 +389,7 @@ unsigned int vpx_variance64x64_sse2(const uint8_t *src_ptr, int src_stride,
vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16));
}
*sse = add32x4_sse2(vsse);
- sum = add32x4_sse2(vsum);
+ sum = (int)add32x4_sse2(vsum);
return *sse - (unsigned int)(((int64_t)sum * sum) >> 12);
}
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index 0cbd151dc..21a35ae3c 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -485,7 +485,7 @@ static void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
// Saturate and convert to 8-bit words
dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
src_ptr += src_stride;
dst_ptr += dst_stride;
@@ -589,8 +589,8 @@ static void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr,
res_reg_0123 = _mm_packus_epi16(res_reg_0123_lo, reg_zero);
// Store only the low 4 bytes (one 4-pixel row) from each register
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
- *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(res_reg_m1012);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(res_reg_0123);
// Update the source by two rows
src_ptr += src_stride_unrolled;
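_mm_cvtsi128_si32 returns int, so storing its result through a uint32_t * forces an implicit sign conversion; writing through an int * keeps the types matched, and the same four bytes reach memory. A minimal sketch of the idiom (hypothetical helper name):

    #include <emmintrin.h>
    #include <stdint.h>

    // Store the low four packed 8-bit pixels of a vector. The int *
    // store matches _mm_cvtsi128_si32's return type; the memory contents
    // are identical to the old uint32_t * version.
    static void store_low_4_pixels(uint8_t *dst, __m128i pixels) {
      *((int *)dst) = _mm_cvtsi128_si32(pixels);
    }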
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 6f2983a4b..c7d880860 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -227,6 +227,9 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
s2[2] = _mm256_unpackhi_epi8(s32b[4], s32b[5]);
}
+ // The output_height is always a multiple of two.
+ assert(!(output_height & 1));
+
for (i = output_height; i > 1; i -= 2) {
__m256i srcRegHead2, srcRegHead3;
@@ -282,35 +285,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
s2[2] = s2[3];
srcRegHead1 = srcRegHead3;
}
-
- // if the number of strides is odd.
- // process only 16 bytes
- if (i > 0) {
- // load the last 16 bytes
- const __m128i srcRegHead2 =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-
- // merge the last 2 results together
- s1[0] = _mm256_castsi128_si256(
- _mm_unpacklo_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
- s2[0] = _mm256_castsi128_si256(
- _mm_unpackhi_epi8(_mm256_castsi256_si128(srcRegHead1), srcRegHead2));
-
- outReg1 = convolve8_8_avx2(s1, f);
- outReg2 = convolve8_8_avx2(s2, f);
-
- // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
- // contain the first and second convolve result respectively
- outReg1 = _mm_packus_epi16(outReg1, outReg2);
-
- // average if necessary
- if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- }
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
- }
}
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
@@ -798,7 +772,7 @@ static void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr,
// Pack to 8-bits
dst = _mm_packus_epi16(dst, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst);
}
}
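The deleted tail handled an odd trailing row, but every caller passes an even output_height, which the new assert documents; the two-rows-per-iteration loop therefore consumes every row. A sketch of the invariant (illustrative function, not library code):

    #include <assert.h>

    // With an even row count, a loop that retires two rows per iteration
    // exits with zero rows left, never one, so the deleted single-row
    // tail was dead code.
    static void process_rows_in_pairs(int output_height) {
      int i;
      assert(!(output_height & 1));  // every caller passes an even height
      for (i = output_height; i > 1; i -= 2) {
        /* filter and store two rows here */
      }
      assert(i == 0);  // the old "if (i > 0)" tail could never execute
    }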
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index ed46d6245..4ea2752d3 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -580,7 +580,7 @@ static void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr,
// Pack to 8-bits
dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(dst_first);
src_ptr += src_stride;
dst_ptr += dst_stride;
@@ -666,8 +666,8 @@ static void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr,
reg_1 = _mm_packus_epi16(reg_1, reg_1);
// Save the result
- *((uint32_t *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
- *((uint32_t *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
+ *((int *)(dst_ptr)) = _mm_cvtsi128_si32(reg_0);
+ *((int *)(dst_ptr + dst_stride)) = _mm_cvtsi128_si32(reg_1);
// Update the source by two rows
src_ptr += src_stride_unrolled;
diff --git a/libvpx/vpx_ports/compiler_attributes.h b/libvpx/vpx_ports/compiler_attributes.h
index 354352016..4b468749b 100644
--- a/libvpx/vpx_ports/compiler_attributes.h
+++ b/libvpx/vpx_ports/compiler_attributes.h
@@ -29,13 +29,23 @@
#endif // __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
#if defined(__clang__) && __has_attribute(no_sanitize)
+// Both of these have defined behavior and are used in certain operations or
+// optimizations thereof. There are cases where an overflow may be unintended,
+// however, so use of these attributes should be done with care.
#define VPX_NO_UNSIGNED_OVERFLOW_CHECK \
__attribute__((no_sanitize("unsigned-integer-overflow")))
-#endif
+#if __clang_major__ >= 12
+#define VPX_NO_UNSIGNED_SHIFT_CHECK \
+ __attribute__((no_sanitize("unsigned-shift-base")))
+#endif // __clang_major__ >= 12
+#endif // __clang__
#ifndef VPX_NO_UNSIGNED_OVERFLOW_CHECK
#define VPX_NO_UNSIGNED_OVERFLOW_CHECK
#endif
+#ifndef VPX_NO_UNSIGNED_SHIFT_CHECK
+#define VPX_NO_UNSIGNED_SHIFT_CHECK
+#endif
//------------------------------------------------------------------------------
// Variable attributes.
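VPX_NO_UNSIGNED_SHIFT_CHECK follows the same pattern as the existing overflow macro: it expands to the clang attribute when supported and to nothing otherwise, so annotated code builds everywhere. A hypothetical usage sketch (the rotate function is illustrative, not from libvpx):

    #include "vpx_ports/compiler_attributes.h"

    // On clang >= 12 the macro expands to
    // __attribute__((no_sanitize("unsigned-shift-base"))), silencing
    // UBSan reports about unsigned left shifts that drop set high bits
    // (defined, wrapping behavior); elsewhere it expands to nothing.
    static VPX_NO_UNSIGNED_SHIFT_CHECK unsigned int rotl32(unsigned int x,
                                                           int n) {
      return (x << n) | (x >> ((32 - n) & 31));
    }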
diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c
index 7eff97b13..61672acad 100644
--- a/libvpx/vpxenc.c
+++ b/libvpx/vpxenc.c
@@ -524,9 +524,12 @@ static const arg_def_t row_mt =
static const arg_def_t disable_loopfilter =
ARG_DEF(NULL, "disable-loopfilter", 1,
- "Control Loopfilter in VP9\n"
+ "Control Loopfilter in VP9:\n"
+ " "
"0: Loopfilter on for all frames (default)\n"
+ " "
"1: Loopfilter off for non reference frames\n"
+ " "
"2: Loopfilter off for all frames");
#endif
diff --git a/libvpx/webmdec.h b/libvpx/webmdec.h
index d8618b07d..6ae7ee16d 100644
--- a/libvpx/webmdec.h
+++ b/libvpx/webmdec.h
@@ -27,7 +27,7 @@ struct WebmInputContext {
const void *block;
int block_frame_index;
int video_track_index;
- uint64_t timestamp_ns;
+ int64_t timestamp_ns;
int is_key_frame;
int reached_eos;
};
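libwebm's parser reports block times as signed nanoseconds (mkvparser::Block::GetTime returns long long), so a signed field avoids an implicit conversion and keeps timestamp arithmetic sensible. A small sketch of why the signedness matters (illustrative function):

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    // With signed timestamps a backwards step between packets yields a
    // small negative delta instead of a huge wrapped unsigned value, so
    // ordering checks and error sentinels behave as expected.
    static void print_timestamp_delta(int64_t prev_ns, int64_t cur_ns) {
      const int64_t delta_ns = cur_ns - prev_ns;  /* may be negative */
      printf("delta: %" PRId64 " ns\n", delta_ns);
    }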
diff --git a/libvpx/y4menc.c b/libvpx/y4menc.c
index 02b729e5b..187798127 100644
--- a/libvpx/y4menc.c
+++ b/libvpx/y4menc.c
@@ -17,39 +17,34 @@ int y4m_write_file_header(char *buf, size_t len, int width, int height,
const char *color;
switch (bit_depth) {
case 8:
- color = fmt == VPX_IMG_FMT_I444
- ? "C444\n"
- : fmt == VPX_IMG_FMT_I422 ? "C422\n" : "C420jpeg\n";
+ color = fmt == VPX_IMG_FMT_I444 ? "C444\n"
+ : fmt == VPX_IMG_FMT_I422 ? "C422\n"
+ : "C420jpeg\n";
break;
case 9:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p9 XYSCSS=444P9\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
- : "C420p9 XYSCSS=420P9\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p9 XYSCSS=444P9\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p9 XYSCSS=422P9\n"
+ : "C420p9 XYSCSS=420P9\n";
break;
case 10:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p10 XYSCSS=444P10\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
- : "C420p10 XYSCSS=420P10\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p10 XYSCSS=444P10\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p10 XYSCSS=422P10\n"
+ : "C420p10 XYSCSS=420P10\n";
break;
case 12:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p12 XYSCSS=444P12\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
- : "C420p12 XYSCSS=420P12\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p12 XYSCSS=444P12\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p12 XYSCSS=422P12\n"
+ : "C420p12 XYSCSS=420P12\n";
break;
case 14:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p14 XYSCSS=444P14\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
- : "C420p14 XYSCSS=420P14\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p14 XYSCSS=444P14\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p14 XYSCSS=422P14\n"
+ : "C420p14 XYSCSS=420P14\n";
break;
case 16:
- color = fmt == VPX_IMG_FMT_I44416
- ? "C444p16 XYSCSS=444P16\n"
- : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
- : "C420p16 XYSCSS=420P16\n";
+ color = fmt == VPX_IMG_FMT_I44416 ? "C444p16 XYSCSS=444P16\n"
+ : fmt == VPX_IMG_FMT_I42216 ? "C422p16 XYSCSS=422P16\n"
+ : "C420p16 XYSCSS=420P16\n";
break;
default: color = NULL; assert(0);
}
diff --git a/libvpx/y4minput.c b/libvpx/y4minput.c
index 7d3c03a7f..745e2f1cd 100644
--- a/libvpx/y4minput.c
+++ b/libvpx/y4minput.c
@@ -21,12 +21,13 @@
// Reads 'size' bytes from 'file' into 'buf' with some fault tolerance.
// Returns true on success.
static int file_read(void *buf, size_t size, FILE *file) {
- const int kMaxRetries = 5;
- int retry_count = 0;
- int file_error;
+ const int kMaxTries = 5;
+ int try_count = 0;
+ int file_error = 0;
size_t len = 0;
- do {
+ while (!feof(file) && len < size && try_count < kMaxTries) {
const size_t n = fread((uint8_t *)buf + len, 1, size - len, file);
+ ++try_count;
len += n;
file_error = ferror(file);
if (file_error) {
@@ -39,13 +40,13 @@ static int file_read(void *buf, size_t size, FILE *file) {
return 0;
}
}
- } while (!feof(file) && len < size && ++retry_count < kMaxRetries);
+ }
if (!feof(file) && len != size) {
fprintf(stderr,
"Error reading file: %u of %u bytes read,"
- " error: %d, retries: %d, %d: %s\n",
- (uint32_t)len, (uint32_t)size, file_error, retry_count, errno,
+ " error: %d, tries: %d, %d: %s\n",
+ (uint32_t)len, (uint32_t)size, file_error, try_count, errno,
strerror(errno));
}
return len == size;
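The rewrite hoists the loop condition to the top, counts every fread attempt rather than only the repeats, and initializes file_error so the diagnostic never prints an indeterminate value. The resulting pattern as a self-contained sketch (simplified: the real file_read also inspects errno before giving up):

    #include <stdint.h>
    #include <stdio.h>

    // Bounded-retry read: keep calling fread until the request is
    // satisfied, EOF is reached, or kMaxTries attempts have been made.
    // Every attempt counts toward the limit, not just the repeats.
    static int read_fully(void *buf, size_t size, FILE *file) {
      const int kMaxTries = 5;
      int try_count = 0;
      size_t len = 0;
      while (!feof(file) && len < size && try_count < kMaxTries) {
        len += fread((uint8_t *)buf + len, 1, size - len, file);
        ++try_count;
        if (ferror(file)) return 0;
      }
      return len == size;
    }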